<a href="https://colab.research.google.com/github/ChiccoSy/BERT_Based_Multiclass_Text_Classification/blob/main/ALBERT_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install numpy
!pip install torch
!pip install nltk
!pip install scikit-learn
!pip install transformers
!pip install beautifulsoup4
!pip install contractions
!pip install sentencepiece

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources that preprocess_text relies on.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on newer
# NLTK releases (3.9 and later); 'punkt' is kept for older releases.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _lemmatizer_and_stop_words():
    """Return the (lemmatizer, English stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both before storing so a failed corpus lookup leaves the cache empty.
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _PREPROCESS_CACHE['resources'] = resources
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, and discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text.

    Returns:
        str: The cleaned text; an empty string when the input is not a string
        or when no token survives the filters.
    """
    # Missing cells have no .lower(); return empty text instead of raising
    # AttributeError in the middle of DataFrame.apply.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stop-word set are identical for every row, so they
    # are built once and reused rather than recreated per call.
    lemmatizer, stop_words = _lemmatizer_and_stop_words()

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Map the string class names in 'Label' to integer ids; the fitted encoder is
# reused later to translate predictions back to names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Store the cleaned version of every response alongside the raw text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Hold out 20% of the rows for validation, with a fixed seed for the split.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# SentencePiece tokenizer that matches the pretrained ALBERT checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column with the module-level tokenizer.

    Each text is encoded on its own with special tokens added, truncated and
    padded to ``max_length``, and the per-text tensors are concatenated along
    dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text in
        the column's iteration order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-text tensors into one tensor per field.
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Number of examples per batch for both loaders (and for test inference later).
batch_size = 16

# Training split: token ids, attention masks and integer labels, row-aligned.
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)

# Validation split, encoded the same way.
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Bundle each split into a dataset of (ids, mask, label) triples.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Seed PyTorch's global RNG so the randomly initialised classifier head and
# the RandomSampler shuffle order are repeatable between runs; the
# train/validation split in this cell is already seeded with random_state=42.
torch.manual_seed(42)

# Fine-tune ALBERT model: pretrained encoder plus a new classification head
# with one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one optimisation pass over the training batches per epoch,
# followed by a full evaluation on the validation split.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` is the 1-based count of batches processed so far in this epoch.
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss is a sum of per-batch losses, so the running mean divides
        # by the number of batches seen; dividing by total_samples / batch_size
        # is off whenever the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    # val_loss is the sum over batches; report the per-batch mean so the figure
    # is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test responses and clean them with the same function used
# for the training text.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into id / mask tensors.
test_inputs, test_masks = tokenize_text(test_df)

# Inference pass: no labels are given to the model and no gradients are tracked.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Index of the highest-scoring class for each row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Translate the predicted class ids back into label names.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder, then score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the new prediction column, to a CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted1.csv"
test_df.to_csv(predicted_csv_path, index=False)

print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6712955236434937, Average Training Loss: 1.6712955236434937, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.7005248069763184, Average Training Loss: 1.685910165309906, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.7955621480941772, Average Training Loss: 1.7224608262379963, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.592206597328186, Average Training Loss: 1.6898972690105438, Training Accuracy: 0.109375
Epoch 1/1, Batch Loss: 1.6625741720199585, Average Training Loss: 1.6844326496124267, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.6397384405136108, Average Training Loss: 1.676983614762624, Training Accuracy: 0.13541666666666666
Epoch 1/1, Batch Loss: 1.5291353464126587, Average Training Loss: 1.6558624335697718, Training Accuracy: 0.16071428571428573
Epoch 1/1, Batch Loss: 1.4934687614440918, Average Training Loss: 1.635563224554062, Training Accuracy: 0.1953125
Epoch 1/1, Batch Loss: 1.6267503499984741, Average Training Loss: 1.634584016270

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.4180451127819549
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.48      0.20      0.28       132
                Educational Opportunity       0.36      0.63      0.46       138
                         Family Support       0.61      0.57      0.59       133
                      Financial Support       0.40      0.27      0.32       130
                 Program Implementation       0.35      0.41      0.38       132

                               accuracy                           0.42       665
                              macro avg       0.44      0.42      0.40       665
                           weighted avg       0.44      0.42      0.41       665

Test Confusion Matrix:
[[26 57 12 11 26]
 [10 87  6 18 17]
 [ 6 21 76  7 23]
 [ 3 36 20 35 36]
 [ 9 42 10 17 54]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted1.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources that preprocess_text relies on.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on newer
# NLTK releases (3.9 and later); 'punkt' is kept for older releases.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _lemmatizer_and_stop_words():
    """Return the (lemmatizer, English stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both before storing so a failed corpus lookup leaves the cache empty.
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _PREPROCESS_CACHE['resources'] = resources
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, and discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text.

    Returns:
        str: The cleaned text; an empty string when the input is not a string
        or when no token survives the filters.
    """
    # Missing cells have no .lower(); return empty text instead of raising
    # AttributeError in the middle of DataFrame.apply.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stop-word set are identical for every row, so they
    # are built once and reused rather than recreated per call.
    lemmatizer, stop_words = _lemmatizer_and_stop_words()

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Map the string class names in 'Label' to integer ids; the fitted encoder is
# reused later to translate predictions back to names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Store the cleaned version of every response alongside the raw text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Hold out 20% of the rows for validation, with a fixed seed for the split.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# SentencePiece tokenizer that matches the pretrained ALBERT checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column with the module-level tokenizer.

    Each text is encoded on its own with special tokens added, truncated and
    padded to ``max_length``, and the per-text tensors are concatenated along
    dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text in
        the column's iteration order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-text tensors into one tensor per field.
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Number of examples per batch for both loaders (and for test inference later).
batch_size = 16

# Training split: token ids, attention masks and integer labels, row-aligned.
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)

# Validation split, encoded the same way.
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Bundle each split into a dataset of (ids, mask, label) triples.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Seed PyTorch's global RNG so the randomly initialised classifier head and
# the RandomSampler shuffle order are repeatable between runs; the
# train/validation split in this cell is already seeded with random_state=42.
torch.manual_seed(42)

# Fine-tune ALBERT model: pretrained encoder plus a new classification head
# with one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one optimisation pass over the training batches per epoch,
# followed by a full evaluation on the validation split.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` is the 1-based count of batches processed so far in this epoch.
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss is a sum of per-batch losses, so the running mean divides
        # by the number of batches seen; dividing by total_samples / batch_size
        # is off whenever the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    # val_loss is the sum over batches; report the per-batch mean so the figure
    # is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test responses and clean them with the same function used
# for the training text.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into id / mask tensors.
test_inputs, test_masks = tokenize_text(test_df)

# Inference pass: no labels are given to the model and no gradients are tracked.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Index of the highest-scoring class for each row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Translate the predicted class ids back into label names.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder, then score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the new prediction column, to a CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted2.csv"
test_df.to_csv(predicted_csv_path, index=False)

print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.646173119544983, Average Training Loss: 1.646173119544983, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6257387399673462, Average Training Loss: 1.6359559297561646, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.8682265281677246, Average Training Loss: 1.7133794625600178, Training Accuracy: 0.14583333333333334
Epoch 1/1, Batch Loss: 1.6223406791687012, Average Training Loss: 1.6906197667121887, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.5780221223831177, Average Training Loss: 1.6681002378463745, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.6265087127685547, Average Training Loss: 1.6611683170000713, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6946278810501099, Average Training Loss: 1.6659482547215052, Training Accuracy: 0.20535714285714285
Epoch 1/1, Batch Loss: 1.5273016691207886, Average Training Loss: 1.6486174315214157, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6394578218460083, Average Training Loss: 1.647599697

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.3533834586466165
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.33      0.33      0.33       132
                Educational Opportunity       0.29      0.47      0.36       138
                         Family Support       0.39      0.17      0.24       133
                      Financial Support       0.42      0.38      0.40       130
                 Program Implementation       0.41      0.41      0.41       132

                               accuracy                           0.35       665
                              macro avg       0.37      0.35      0.35       665
                           weighted avg       0.37      0.35      0.35       665

Test Confusion Matrix:
[[43 43 10 16 20]
 [33 65 10 14 16]
 [32 33 23 21 24]
 [15 37 11 50 17]
 [ 9 47  5 17 54]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted2.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources that preprocess_text relies on.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on newer
# NLTK releases (3.9 and later); 'punkt' is kept for older releases.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _lemmatizer_and_stop_words():
    """Return the (lemmatizer, English stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both before storing so a failed corpus lookup leaves the cache empty.
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _PREPROCESS_CACHE['resources'] = resources
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, and discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text.

    Returns:
        str: The cleaned text; an empty string when the input is not a string
        or when no token survives the filters.
    """
    # Missing cells have no .lower(); return empty text instead of raising
    # AttributeError in the middle of DataFrame.apply.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stop-word set are identical for every row, so they
    # are built once and reused rather than recreated per call.
    lemmatizer, stop_words = _lemmatizer_and_stop_words()

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Map the string class names in 'Label' to integer ids; the fitted encoder is
# reused later to translate predictions back to names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Store the cleaned version of every response alongside the raw text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Hold out 20% of the rows for validation, with a fixed seed for the split.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# SentencePiece tokenizer that matches the pretrained ALBERT checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column with the module-level tokenizer.

    Each text is encoded on its own with special tokens added, truncated and
    padded to ``max_length``, and the per-text tensors are concatenated along
    dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text in
        the column's iteration order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-text tensors into one tensor per field.
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Number of examples per batch for both loaders (and for test inference later).
batch_size = 32

# Training split: token ids, attention masks and integer labels, row-aligned.
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)

# Validation split, encoded the same way.
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Bundle each split into a dataset of (ids, mask, label) triples.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Seed PyTorch's global RNG so the randomly initialised classifier head and
# the RandomSampler shuffle order are repeatable between runs; the
# train/validation split in this cell is already seeded with random_state=42.
torch.manual_seed(42)

# Fine-tune ALBERT model: pretrained encoder plus a new classification head
# with one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one optimisation pass over the training batches per epoch,
# followed by a full evaluation on the validation split.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` is the 1-based count of batches processed so far in this epoch.
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss is a sum of per-batch losses, so the running mean divides
        # by the number of batches seen; dividing by total_samples / batch_size
        # is off whenever the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    # val_loss is the sum over batches; report the per-batch mean so the figure
    # is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test responses and clean them with the same function used
# for the training text.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into id / mask tensors.
test_inputs, test_masks = tokenize_text(test_df)

# Inference pass: no labels are given to the model and no gradients are tracked.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Index of the highest-scoring class for each row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Translate the predicted class ids back into label names.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder, then score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the new prediction column, to a CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted3.csv"
test_df.to_csv(predicted_csv_path, index=False)

print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.8201982975006104, Average Training Loss: 1.8201982975006104, Training Accuracy: 0.0625
Epoch 1/1, Batch Loss: 1.6625497341156006, Average Training Loss: 1.7413740158081055, Training Accuracy: 0.140625
Epoch 1/1, Batch Loss: 1.6621750593185425, Average Training Loss: 1.7149743636449177, Training Accuracy: 0.13541666666666666
Epoch 1/1, Batch Loss: 1.6168330907821655, Average Training Loss: 1.6904390454292297, Training Accuracy: 0.140625
Epoch 1/1, Batch Loss: 1.6520626544952393, Average Training Loss: 1.6827637672424316, Training Accuracy: 0.1375
Epoch 1/1, Batch Loss: 1.6143606901168823, Average Training Loss: 1.6713632543881733, Training Accuracy: 0.15104166666666666
Epoch 1/1, Batch Loss: 1.55271577835083, Average Training Loss: 1.6544136149542672, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6270487308502197, Average Training Loss: 1.6509930044412613, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.578371524810791, Average Training Loss: 1.642923951

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.39849624060150374
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.37      0.29      0.32       132
                Educational Opportunity       0.35      0.46      0.39       138
                         Family Support       0.54      0.53      0.53       133
                      Financial Support       0.37      0.35      0.36       130
                 Program Implementation       0.38      0.36      0.37       132

                               accuracy                           0.40       665
                              macro avg       0.40      0.40      0.40       665
                           weighted avg       0.40      0.40      0.40       665

Test Confusion Matrix:
[[38 34 10 24 26]
 [27 63 15 11 22]
 [13 13 70 29  8]
 [13 28 21 46 22]
 [11 44 14 15 48]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted3.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources used by preprocess_text.
# 'punkt_tab' is what recent NLTK releases load for nltk.word_tokenize; the
# older 'punkt' package is kept so the cell also works on earlier releases.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    cached = getattr(_preprocess_resources, 'cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources.cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, delete every character that is not an ASCII letter or
    whitespace, tokenize and lemmatize, then drop English stop words and
    tokens of two characters or fewer.

    Args:
        text: Raw response. Missing values (None / NaN, e.g. an empty CSV
            cell read by pandas) yield ''; any other non-string value is
            converted with str() first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Without this guard a NaN cell reaches text.lower() and raises AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization; the lemmatizer and stop-word set are
    # built once and reused instead of being recreated for every row.
    lemmatizer, stop_words = _preprocess_resources()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Pretrained ALBERT tokenizer (lower-casing enabled), used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Clean every response and store the result next to the raw text
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Fit the encoder on the label column, then store the integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

def tokenize_text(df, max_length=256):
    """Encode df['Processed_Response'] with the module-level ``tokenizer``.

    Every text is encoded on its own, with special tokens added and the
    sequence padded/truncated to ``max_length``; the per-text tensors are
    then concatenated along dim 0.

    Args:
        df: Frame holding a 'Processed_Response' column of strings.
        max_length: Target sequence length for padding and truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of torch tensors.
    """
    def _encode(text):
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in df['Processed_Response']]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# --- Tensors and data loaders ---
batch_size = 32

# Training split: token ids, attention masks and integer labels, drawn in random order
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same layout, iterated sequentially
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# --- Model, optimizer and learning-rate schedule ---
# ALBERT with a classification head sized to the number of encoded classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Class ids 0..n_classes-1 as assigned by LabelEncoder. Passing them to the
# metric helpers keeps every report/matrix aligned with label_encoder.classes_
# even when a class is missing from a split's labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predictions come from the same forward pass that is used for the update
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the
        # batch count (rather than total_samples / batch_size) stays correct
        # when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 reports 0.0 for classes
    # that were never predicted instead of emitting a warning per metric.
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss holds the sum of per-batch losses; report the per-batch mean so
    # the figure is comparable with the training loss and across batch sizes.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# ---- Held-out test evaluation ----
# Load the test set and clean it with the same preprocess_text used on the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Predict batch by batch with gradients disabled. No sampler/shuffle is passed
# to the DataLoader, so predictions are collected in test_df row order.
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predicted class ids back to their original label strings
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the ground-truth labels with the encoder fitted on the training data
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Class ids 0..n_classes-1 as assigned by LabelEncoder. Passing them explicitly
# keeps the report rows and the confusion-matrix shape aligned with
# label_encoder.classes_ even if a class is absent from both the labels and the
# predictions. zero_division=0 reports 0.0 for never-predicted classes without
# emitting an UndefinedMetricWarning per metric.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save the test frame (now including 'Predicted Labels') to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted4.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.7173155546188354, Average Training Loss: 1.7173155546188354, Training Accuracy: 0.09375
Epoch 1/1, Batch Loss: 1.6605349779129028, Average Training Loss: 1.6889252662658691, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.7298566102981567, Average Training Loss: 1.702569047609965, Training Accuracy: 0.11458333333333333
Epoch 1/1, Batch Loss: 1.5824670791625977, Average Training Loss: 1.6725435554981232, Training Accuracy: 0.1328125
Epoch 1/1, Batch Loss: 1.6430907249450684, Average Training Loss: 1.6666529893875122, Training Accuracy: 0.14375
Epoch 1/1, Batch Loss: 1.5998774766921997, Average Training Loss: 1.6555237372716267, Training Accuracy: 0.16145833333333334
Epoch 1/1, Batch Loss: 1.6678109169006348, Average Training Loss: 1.6572790486471993, Training Accuracy: 0.15178571428571427
Epoch 1/1, Batch Loss: 1.6099072694778442, Average Training Loss: 1.65135757625103, Training Accuracy: 0.15234375
Epoch 1/1, Batch Loss: 1.614555835723877, Average Training L

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 27.37865900993347, Validation Accuracy: 0.18233082706766918
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.19      0.52      0.28       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.17      0.42      0.25       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.18       532
                              macro avg       0.07      0.19      0.10       532
                           weighted avg       0.07      0.18      0.10       532

Validation Confusion Matrix:
[[ 0 73  0 39  0]
 [ 0 53  0 49  0]
 [ 0 48  0 62  0]
 [ 0 62  0 44  0]
 [ 0 43  0 59  0]]
Test Accuracy: 0.204511278

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources used by preprocess_text.
# 'punkt_tab' is what recent NLTK releases load for nltk.word_tokenize; the
# older 'punkt' package is kept so the cell also works on earlier releases.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    cached = getattr(_preprocess_resources, 'cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources.cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, delete every character that is not an ASCII letter or
    whitespace, tokenize and lemmatize, then drop English stop words and
    tokens of two characters or fewer.

    Args:
        text: Raw response. Missing values (None / NaN, e.g. an empty CSV
            cell read by pandas) yield ''; any other non-string value is
            converted with str() first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Without this guard a NaN cell reaches text.lower() and raises AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization; the lemmatizer and stop-word set are
    # built once and reused instead of being recreated for every row.
    lemmatizer, stop_words = _preprocess_resources()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Pretrained ALBERT tokenizer (lower-casing enabled), used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Clean every response and store the result next to the raw text
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Fit the encoder on the label column, then store the integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

def tokenize_text(df, max_length=256):
    """Encode df['Processed_Response'] with the module-level ``tokenizer``.

    Every text is encoded on its own, with special tokens added and the
    sequence padded/truncated to ``max_length``; the per-text tensors are
    then concatenated along dim 0.

    Args:
        df: Frame holding a 'Processed_Response' column of strings.
        max_length: Target sequence length for padding and truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of torch tensors.
    """
    def _encode(text):
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in df['Processed_Response']]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# --- Tensors and data loaders ---
batch_size = 16

# Training split: token ids, attention masks and integer labels, drawn in random order
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same layout, iterated sequentially
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# --- Model, optimizer and learning-rate schedule ---
# ALBERT with a classification head sized to the number of encoded classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Class ids 0..n_classes-1 as assigned by LabelEncoder. Passing them to the
# metric helpers keeps every report/matrix aligned with label_encoder.classes_
# even when a class is missing from a split's labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predictions come from the same forward pass that is used for the update
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the
        # batch count (rather than total_samples / batch_size) stays correct
        # when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 reports 0.0 for classes
    # that were never predicted instead of emitting a warning per metric.
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss holds the sum of per-batch losses; report the per-batch mean so
    # the figure is comparable with the training loss and across batch sizes.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# ---- Held-out test evaluation ----
# Load the test set and clean it with the same preprocess_text used on the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Predict batch by batch with gradients disabled. No sampler/shuffle is passed
# to the DataLoader, so predictions are collected in test_df row order.
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predicted class ids back to their original label strings
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the ground-truth labels with the encoder fitted on the training data
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Class ids 0..n_classes-1 as assigned by LabelEncoder. Passing them explicitly
# keeps the report rows and the confusion-matrix shape aligned with
# label_encoder.classes_ even if a class is absent from both the labels and the
# predictions. zero_division=0 reports 0.0 for never-predicted classes without
# emitting an UndefinedMetricWarning per metric.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save the test frame (now including 'Predicted Labels') to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted6.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.8476096391677856, Average Training Loss: 1.8476096391677856, Training Accuracy: 0.0625
Epoch 1/1, Batch Loss: 1.6109908819198608, Average Training Loss: 1.7293002605438232, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.7284486293792725, Average Training Loss: 1.729016383488973, Training Accuracy: 0.14583333333333334
Epoch 1/1, Batch Loss: 1.7170947790145874, Average Training Loss: 1.7260359823703766, Training Accuracy: 0.140625
Epoch 1/1, Batch Loss: 1.6583185195922852, Average Training Loss: 1.7124924898147582, Training Accuracy: 0.1375
Epoch 1/1, Batch Loss: 1.6077477931976318, Average Training Loss: 1.6950350403785706, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6026540994644165, Average Training Loss: 1.68183776310512, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.6836179494857788, Average Training Loss: 1.6820602864027023, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6488018035888672, Average Training Loss: 1.678364

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 54.55524396896362, Validation Accuracy: 0.26127819548872183
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.22      1.00      0.36       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.91      0.28      0.43       102

                               accuracy                           0.26       532
                              macro avg       0.23      0.26      0.16       532
                           weighted avg       0.22      0.26      0.16       532

Validation Confusion Matrix:
[[  0   2 110   0   0]
 [  0   0 101   0   1]
 [  0   0 110   0   0]
 [  0   2 102   0   2]
 [  0   3  70   0  29]]
T

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources used by preprocess_text.
# 'punkt_tab' is what recent NLTK releases load for nltk.word_tokenize; the
# older 'punkt' package is kept so the cell also works on earlier releases.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    cached = getattr(_preprocess_resources, 'cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources.cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, delete every character that is not an ASCII letter or
    whitespace, tokenize and lemmatize, then drop English stop words and
    tokens of two characters or fewer.

    Args:
        text: Raw response. Missing values (None / NaN, e.g. an empty CSV
            cell read by pandas) yield ''; any other non-string value is
            converted with str() first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Without this guard a NaN cell reaches text.lower() and raises AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization; the lemmatizer and stop-word set are
    # built once and reused instead of being recreated for every row.
    lemmatizer, stop_words = _preprocess_resources()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Pretrained ALBERT tokenizer (lower-casing enabled), used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Clean every response and store the result next to the raw text
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Fit the encoder on the label column, then store the integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

def tokenize_text(df, max_length=128):
    """Encode df['Processed_Response'] into fixed-length ALBERT model inputs.

    Uses the module-level ``tokenizer``. Every text gets the model's special
    tokens and is padded or truncated to exactly ``max_length`` positions.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Number of token positions per row after padding/truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)``, two integer tensors with one
        row per row of ``df`` and ``max_length`` columns. An empty frame gives
        two ``(0, max_length)`` tensors.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling an empty batch.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus loop and the final
    # torch.cat: the tokenizer already returns stacked (rows, max_length)
    # tensors when it is given a list of texts.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# ---------------------------------------------------------------------------
# Fine-tune ALBERT on the train split, report training / validation metrics
# after each epoch, then score the held-out test CSV and save its predictions.
# ---------------------------------------------------------------------------

# Seed PyTorch's global RNG (same value as the split's random_state) so the
# newly initialised classifier head, dropout and batch shuffling start from
# the same state on every run.
torch.manual_seed(42)

# Use the GPU when the runtime has one; the model and every batch are moved
# to this device, and results are copied back to the CPU before NumPy use.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # training rows are drawn in random order
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)  # fixed order for evaluation
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: pretrained encoder with a classification head that
# has one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id. Passing these to the sklearn reports keeps
# target_names aligned (and the confusion matrix square) even when a class is
# missing from the split being scored.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

# Linear schedule with no warm-up, stepped once per batch over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch. The mean is
        # taken over the batches seen so far, which stays correct when the
        # last batch holds fewer than batch_size rows.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean loss per validation batch so the figure is comparable
    # with the training average and across batch sizes (val_loss is the sum).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy. The test file's 'True Labels' column is encoded with
# the encoder fitted on the training labels.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted4.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.7835159301757812, Average Training Loss: 1.7835159301757812, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6728261709213257, Average Training Loss: 1.7281710505485535, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.7721705436706543, Average Training Loss: 1.7428375482559204, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.6996140480041504, Average Training Loss: 1.732031673192978, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6084420680999756, Average Training Loss: 1.7073137521743775, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.682892084121704, Average Training Loss: 1.7032434741655986, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.6428420543670654, Average Training Loss: 1.6946146999086653, Training Accuracy: 0.20535714285714285
Epoch 1/1, Batch Loss: 1.6564208269119263, Average Training Loss: 1.6898404657840729, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.59413743019104, Average Training Loss

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5458646616541354
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.24      0.36       132
                Educational Opportunity       0.40      0.57      0.47       138
                         Family Support       0.89      0.79      0.84       133
                      Financial Support       0.49      0.71      0.58       130
                 Program Implementation       0.48      0.42      0.45       132

                               accuracy                           0.55       665
                              macro avg       0.59      0.55      0.54       665
                           weighted avg       0.59      0.55      0.54       665

Test Confusion Matrix:
[[ 32  44   4  26  26]
 [  6  78   4  34  16]
 [  2   4 105  13   9]
 [  0  25   3  92  10]
 [  6  45   2  23  56]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data this cell relies on: the stopword lists read through
# nltk.corpus.stopwords, the WordNet corpus used by WordNetLemmatizer, and
# the 'punkt' tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the labelled dataset. Its 'Responses' (text) and 'Label' (class)
# columns are the ones read further down in this cell.
# NOTE(review): the path points into Google Drive, so Drive presumably has to
# be mounted at /content/drive before this cell runs — confirm.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stopwords and drop tokens of two characters or fewer.

    Args:
        text: Raw response text. Missing values (None / NaN, which pandas
            produces for empty CSV cells) are treated as an empty response;
            any other non-string value is converted with str().

    Returns:
        str: The surviving tokens joined by single spaces ('' when nothing
        survives or the input was missing).
    """
    # Empty CSV cells reach Series.apply() as float NaN, which has no
    # .lower(); treat them as empty text instead of crashing the whole run.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer and the stopword set are the same for every row, so they
    # are built on the first call and kept on the function object instead of
    # being re-created (and the stopword corpus re-read) for each response.
    lemmatizer = getattr(preprocess_text, '_lemmatizer', None)
    if lemmatizer is None:
        lemmatizer = preprocess_text._lemmatizer = WordNetLemmatizer()
        preprocess_text._stop_words = frozenset(stopwords.words('english'))
    stop_words = preprocess_text._stop_words

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. Only exact whitespace-delimited tokens are looked
    # up, so a contraction with punctuation attached is passed through as is.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response; the cleaned text is stored next to the original
# in a new 'Processed_Response' column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids. The fitted encoder is
# kept because it is reused below to size the classifier head, to name the
# rows of the reports and to translate predictions back to class names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run. The split is not stratified by label.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Load the pretrained ALBERT tokenizer (lower-casing enabled). tokenize_text
# below reads it as a module-level global.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode df['Processed_Response'] into fixed-length ALBERT model inputs.

    Uses the module-level ``tokenizer``. Every text gets the model's special
    tokens and is padded or truncated to exactly ``max_length`` positions.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Number of token positions per row after padding/truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)``, two integer tensors with one
        row per row of ``df`` and ``max_length`` columns. An empty frame gives
        two ``(0, max_length)`` tensors.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling an empty batch.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus loop and the final
    # torch.cat: the tokenizer already returns stacked (rows, max_length)
    # tensors when it is given a list of texts.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# ---------------------------------------------------------------------------
# Fine-tune ALBERT on the train split, report training / validation metrics
# after each epoch, then score the held-out test CSV and save its predictions.
# ---------------------------------------------------------------------------

# Seed PyTorch's global RNG (same value as the split's random_state) so the
# newly initialised classifier head, dropout and batch shuffling start from
# the same state on every run.
torch.manual_seed(42)

# Use the GPU when the runtime has one; the model and every batch are moved
# to this device, and results are copied back to the CPU before NumPy use.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # training rows are drawn in random order
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)  # fixed order for evaluation
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: pretrained encoder with a classification head that
# has one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id. Passing these to the sklearn reports keeps
# target_names aligned (and the confusion matrix square) even when a class is
# missing from the split being scored.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

# Linear schedule with no warm-up, stepped once per batch over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch. The mean is
        # taken over the batches seen so far, which stays correct when the
        # last batch holds fewer than batch_size rows.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean loss per validation batch so the figure is comparable
    # with the training average and across batch sizes (val_loss is the sum).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy. The test file's 'True Labels' column is encoded with
# the encoder fitted on the training labels.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted6.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6432163715362549, Average Training Loss: 1.6432163715362549, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6913217306137085, Average Training Loss: 1.6672690510749817, Training Accuracy: 0.171875
Epoch 1/1, Batch Loss: 1.7122589349746704, Average Training Loss: 1.6822656790415447, Training Accuracy: 0.17708333333333334
Epoch 1/1, Batch Loss: 1.6252217292785645, Average Training Loss: 1.6680046916007996, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.5829957723617554, Average Training Loss: 1.6510029077529906, Training Accuracy: 0.19375
Epoch 1/1, Batch Loss: 1.633655309677124, Average Training Loss: 1.648111641407013, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.600424885749817, Average Training Loss: 1.6412992477416992, Training Accuracy: 0.20535714285714285
Epoch 1/1, Batch Loss: 1.6280783414840698, Average Training Loss: 1.6396466344594955, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.5390247106552124, Average Training Loss: 1.62846

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5157894736842106
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.46      0.23      0.31       132
                Educational Opportunity       0.33      0.43      0.38       138
                         Family Support       0.77      0.91      0.83       133
                      Financial Support       0.55      0.41      0.47       130
                 Program Implementation       0.48      0.59      0.53       132

                               accuracy                           0.52       665
                              macro avg       0.52      0.52      0.50       665
                           weighted avg       0.52      0.52      0.50       665

Test Confusion Matrix:
[[ 31  52   5  13  31]
 [ 18  60   9  19  32]
 [  1   8 121   1   2]
 [  8  31  19  53  19]
 [ 10  30   3  11  78]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data this cell relies on: the stopword lists read through
# nltk.corpus.stopwords, the WordNet corpus used by WordNetLemmatizer, and
# the 'punkt' tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the labelled dataset. Its 'Responses' (text) and 'Label' (class)
# columns are the ones read further down in this cell.
# NOTE(review): the path points into Google Drive, so Drive presumably has to
# be mounted at /content/drive before this cell runs — confirm.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stopwords and drop tokens of two characters or fewer.

    Args:
        text: Raw response text. Missing values (None / NaN, which pandas
            produces for empty CSV cells) are treated as an empty response;
            any other non-string value is converted with str().

    Returns:
        str: The surviving tokens joined by single spaces ('' when nothing
        survives or the input was missing).
    """
    # Empty CSV cells reach Series.apply() as float NaN, which has no
    # .lower(); treat them as empty text instead of crashing the whole run.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer and the stopword set are the same for every row, so they
    # are built on the first call and kept on the function object instead of
    # being re-created (and the stopword corpus re-read) for each response.
    lemmatizer = getattr(preprocess_text, '_lemmatizer', None)
    if lemmatizer is None:
        lemmatizer = preprocess_text._lemmatizer = WordNetLemmatizer()
        preprocess_text._stop_words = frozenset(stopwords.words('english'))
    stop_words = preprocess_text._stop_words

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. Only exact whitespace-delimited tokens are looked
    # up, so a contraction with punctuation attached is passed through as is.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response; the cleaned text is stored next to the original
# in a new 'Processed_Response' column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids. The fitted encoder is
# kept because it is reused below to size the classifier head, to name the
# rows of the reports and to translate predictions back to class names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run. The split is not stratified by label.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Load the pretrained ALBERT tokenizer (lower-casing enabled). tokenize_text
# below reads it as a module-level global.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode df['Processed_Response'] into fixed-length ALBERT model inputs.

    Uses the module-level ``tokenizer``. Every text gets the model's special
    tokens and is padded or truncated to exactly ``max_length`` positions.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Number of token positions per row after padding/truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)``, two integer tensors with one
        row per row of ``df`` and ``max_length`` columns. An empty frame gives
        two ``(0, max_length)`` tensors.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling an empty batch.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus loop and the final
    # torch.cat: the tokenizer already returns stacked (rows, max_length)
    # tensors when it is given a list of texts.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# ---------------------------------------------------------------------------
# Fine-tune ALBERT on the train split, report training / validation metrics
# after each epoch, then score the held-out test CSV and save its predictions.
# ---------------------------------------------------------------------------

# Seed PyTorch's global RNG (same value as the split's random_state) so the
# newly initialised classifier head, dropout and batch shuffling start from
# the same state on every run.
torch.manual_seed(42)

# Use the GPU when the runtime has one; the model and every batch are moved
# to this device, and results are copied back to the CPU before NumPy use.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # training rows are drawn in random order
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)  # fixed order for evaluation
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: pretrained encoder with a classification head that
# has one output per class known to the label encoder.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id. Passing these to the sklearn reports keeps
# target_names aligned (and the confusion matrix square) even when a class is
# missing from the split being scored.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

# Linear schedule with no warm-up, stepped once per batch over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch. The mean is
        # taken over the batches seen so far, which stays correct when the
        # last batch holds fewer than batch_size rows.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean loss per validation batch so the figure is comparable
    # with the training average and across batch sizes (val_loss is the sum).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy. The test file's 'True Labels' column is encoded with
# the encoder fitted on the training labels.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted7.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6995556354522705, Average Training Loss: 1.6995556354522705, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.747778296470642, Average Training Loss: 1.7236669659614563, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6239254474639893, Average Training Loss: 1.6904197931289673, Training Accuracy: 0.22916666666666666
Epoch 1/1, Batch Loss: 1.5619579553604126, Average Training Loss: 1.6583043336868286, Training Accuracy: 0.265625
Epoch 1/1, Batch Loss: 1.5782424211502075, Average Training Loss: 1.6422919511795044, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.6245133876800537, Average Training Loss: 1.6393288572629292, Training Accuracy: 0.2708333333333333
Epoch 1/1, Batch Loss: 1.5702881813049316, Average Training Loss: 1.629465903554644, Training Accuracy: 0.29017857142857145
Epoch 1/1, Batch Loss: 1.6387985944747925, Average Training Loss: 1.6306324899196625, Training Accuracy: 0.28515625
Epoch 1/1, Batch Loss: 1.621112585067749, Average Training

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.43007518796992483
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.23      0.17      0.20       132
                Educational Opportunity       0.33      0.64      0.43       138
                         Family Support       0.60      0.58      0.59       133
                      Financial Support       0.66      0.35      0.46       130
                 Program Implementation       0.54      0.39      0.45       132

                               accuracy                           0.43       665
                              macro avg       0.47      0.43      0.43       665
                           weighted avg       0.47      0.43      0.43       665

Test Confusion Matrix:
[[23 78  7  6 18]
 [20 89 10  6 13]
 [29 17 77  5  5]
 [18 28 31 46  7]
 [ 9 61  4  7 51]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted7.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stop-word list,
# WordNet (lemmatizer) and the Punkt sentence/word tokenizer models.
# 'punkt_tab' is the tokenizer resource loaded by recent NLTK releases; on
# older releases that download only prints a "not found" message and is
# harmless, so requesting both keeps the cell runnable on either version.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training responses from the mounted Drive folder
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stop words and drop tokens
    of two characters or fewer.

    Args:
        text: Raw response text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            response instead of raising on ``.lower()``.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ''

    # Built once and reused: re-reading the stop-word list and re-creating the
    # lemmatizer for every row is loop-invariant work.
    lemmatizer, stop_words = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then filter stop words and short tokens
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; random_state pins the split.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the 'albert-base-v2' checkpoint, lower-casing its input.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    The module-level ``tokenizer`` is called once on the whole list of texts
    (the per-string ``encode_plus`` API is deprecated in favour of calling the
    tokenizer directly), so no per-row tensors have to be concatenated.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors with one row per text
        and ``max_length`` columns each.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed torch's global RNG before any randomness is consumed so reruns are
# repeatable: the RandomSampler shuffle order and the newly initialized
# classification head would otherwise differ on every execution.
torch.manual_seed(42)

# Tokenize both splits with the defaults of tokenize_text
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per encoded class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): the AdamW exported by transformers is deprecated upstream in
# favour of torch.optim.AdamW; confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per training batch
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over total_steps with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation pass over val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training predictions come from this forward pass, i.e. before the
        # weights are updated on the batch.
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # The mean is taken over the number of batches seen so far; dividing
        # by total_samples / batch_size is wrong when the last batch is short.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics.
    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test responses and clean them with the same preprocess_text
# function that was applied to the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into input ids and attention masks.
test_inputs, test_masks = tokenize_text(test_df)

# Run inference batch by batch, in file order, with gradients disabled.
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for batch in test_dataloader:
        inputs, masks = batch
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Highest-scoring class id for every row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Decode the predicted class ids back to their label strings.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder and score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the predicted labels, to a new CSV file
# and report where it was saved.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted8.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5697640180587769, Average Training Loss: 1.5697640180587769, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.5938040018081665, Average Training Loss: 1.5817840099334717, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.6218210458755493, Average Training Loss: 1.5951296885808308, Training Accuracy: 0.2708333333333333
Epoch 1/1, Batch Loss: 1.6517627239227295, Average Training Loss: 1.6092879474163055, Training Accuracy: 0.2578125
Epoch 1/1, Batch Loss: 1.5918720960617065, Average Training Loss: 1.6058047771453858, Training Accuracy: 0.2625
Epoch 1/1, Batch Loss: 1.5741701126098633, Average Training Loss: 1.600532333056132, Training Accuracy: 0.2864583333333333
Epoch 1/1, Batch Loss: 1.6805471181869507, Average Training Loss: 1.6119630166462489, Training Accuracy: 0.2767857142857143
Epoch 1/1, Batch Loss: 1.5575358867645264, Average Training Loss: 1.6051596254110336, Training Accuracy: 0.28515625
Epoch 1/1, Batch Loss: 1.662888765335083, Average Training Loss

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.58796992481203
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.49      0.30      0.37       132
                Educational Opportunity       0.40      0.42      0.41       138
                         Family Support       0.94      0.94      0.94       133
                      Financial Support       0.63      0.71      0.66       130
                 Program Implementation       0.48      0.58      0.53       132

                               accuracy                           0.59       665
                              macro avg       0.59      0.59      0.58       665
                           weighted avg       0.59      0.59      0.58       665

Test Confusion Matrix:
[[ 39  36   3  15  39]
 [ 17  58   4  25  34]
 [  0   4 125   3   1]
 [ 11  16   1  92  10]
 [ 12  31   0  12  77]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBE

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stop-word list,
# WordNet (lemmatizer) and the Punkt sentence/word tokenizer models.
# 'punkt_tab' is the tokenizer resource loaded by recent NLTK releases; on
# older releases that download only prints a "not found" message and is
# harmless, so requesting both keeps the cell runnable on either version.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training responses from the mounted Drive folder
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stop words and drop tokens
    of two characters or fewer.

    Args:
        text: Raw response text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            response instead of raising on ``.lower()``.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ''

    # Built once and reused: re-reading the stop-word list and re-creating the
    # lemmatizer for every row is loop-invariant work.
    lemmatizer, stop_words = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then filter stop words and short tokens
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; random_state pins the split.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the 'albert-base-v2' checkpoint, lower-casing its input.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    The module-level ``tokenizer`` is called once on the whole list of texts
    (the per-string ``encode_plus`` API is deprecated in favour of calling the
    tokenizer directly), so no per-row tensors have to be concatenated.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors with one row per text
        and ``max_length`` columns each.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed torch's global RNG before any randomness is consumed so reruns are
# repeatable: the RandomSampler shuffle order and the newly initialized
# classification head would otherwise differ on every execution.
torch.manual_seed(42)

# Tokenize both splits with the defaults of tokenize_text
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per encoded class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): the AdamW exported by transformers is deprecated upstream in
# favour of torch.optim.AdamW; confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per training batch
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over total_steps with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation pass over val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training predictions come from this forward pass, i.e. before the
        # weights are updated on the batch.
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # The mean is taken over the number of batches seen so far; dividing
        # by total_samples / batch_size is wrong when the last batch is short.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics.
    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test responses and clean them with the same preprocess_text
# function that was applied to the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into input ids and attention masks.
test_inputs, test_masks = tokenize_text(test_df)

# Run inference batch by batch, in file order, with gradients disabled.
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for batch in test_dataloader:
        inputs, masks = batch
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Highest-scoring class id for every row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Decode the predicted class ids back to their label strings.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder and score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the predicted labels, to a new CSV file
# and report where it was saved.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted10.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.742398977279663, Average Training Loss: 1.742398977279663, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6051628589630127, Average Training Loss: 1.673780918121338, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.617234468460083, Average Training Loss: 1.6549321015675862, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.6243963241577148, Average Training Loss: 1.6472981572151184, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.6500911712646484, Average Training Loss: 1.6478567600250245, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.5605908632278442, Average Training Loss: 1.6333124438921611, Training Accuracy: 0.22916666666666666
Epoch 1/1, Batch Loss: 1.5502517223358154, Average Training Loss: 1.621446626526969, Training Accuracy: 0.23660714285714285
Epoch 1/1, Batch Loss: 1.559890627861023, Average Training Loss: 1.6137521266937256, Training Accuracy: 0.23828125
Epoch 1/1, Batch Loss: 1.6839953660964966, Average Training Lo

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5082706766917293
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.44      0.74      0.55       132
                Educational Opportunity       0.39      0.33      0.36       138
                         Family Support       0.64      0.83      0.72       133
                      Financial Support       0.66      0.25      0.37       130
                 Program Implementation       0.50      0.39      0.44       132

                               accuracy                           0.51       665
                              macro avg       0.53      0.51      0.49       665
                           weighted avg       0.52      0.51      0.49       665

Test Confusion Matrix:
[[ 98  12   6   4  12]
 [ 61  46  10   4  17]
 [  8  10 110   3   2]
 [ 17  20  40  33  20]
 [ 41  29   5   6  51]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stop-word list,
# WordNet (lemmatizer) and the Punkt sentence/word tokenizer models.
# 'punkt_tab' is the tokenizer resource loaded by recent NLTK releases; on
# older releases that download only prints a "not found" message and is
# harmless, so requesting both keeps the cell runnable on either version.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training responses from the mounted Drive folder
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stop words and drop tokens
    of two characters or fewer.

    Args:
        text: Raw response text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            response instead of raising on ``.lower()``.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ''

    # Built once and reused: re-reading the stop-word list and re-creating the
    # lemmatizer for every row is loop-invariant work.
    lemmatizer, stop_words = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then filter stop words and short tokens
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; random_state pins the split.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the 'albert-base-v2' checkpoint, lower-casing its input.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    The module-level ``tokenizer`` is called once on the whole list of texts
    (the per-string ``encode_plus`` API is deprecated in favour of calling the
    tokenizer directly), so no per-row tensors have to be concatenated.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors with one row per text
        and ``max_length`` columns each.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed torch's global RNG before any randomness is consumed so reruns are
# repeatable: the RandomSampler shuffle order and the newly initialized
# classification head would otherwise differ on every execution.
torch.manual_seed(42)

# Tokenize both splits with the defaults of tokenize_text
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per encoded class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): the AdamW exported by transformers is deprecated upstream in
# favour of torch.optim.AdamW; confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per training batch
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over total_steps with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation pass over val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training predictions come from this forward pass, i.e. before the
        # weights are updated on the batch.
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # The mean is taken over the number of batches seen so far; dividing
        # by total_samples / batch_size is wrong when the last batch is short.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics.
    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test responses and clean them with the same preprocess_text
# function that was applied to the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test text into input ids and attention masks.
test_inputs, test_masks = tokenize_text(test_df)

# Run inference batch by batch, in file order, with gradients disabled.
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for batch in test_dataloader:
        inputs, masks = batch
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Highest-scoring class id for every row of the batch.
        preds = np.argmax(logits.detach().numpy(), axis=1)
        test_predictions.extend(preds)

# Decode the predicted class ids back to their label strings.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the reference labels with the same encoder and score the predictions.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, including the predicted labels, to a new CSV file
# and report where it was saved.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted11.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5784037113189697, Average Training Loss: 1.5784037113189697, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.7715948820114136, Average Training Loss: 1.6749992966651917, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.640213966369629, Average Training Loss: 1.6634041865666707, Training Accuracy: 0.13541666666666666
Epoch 1/1, Batch Loss: 1.6949542760849, Average Training Loss: 1.671291708946228, Training Accuracy: 0.140625
Epoch 1/1, Batch Loss: 1.5555511713027954, Average Training Loss: 1.6481436014175415, Training Accuracy: 0.175
Epoch 1/1, Batch Loss: 1.6172927618026733, Average Training Loss: 1.6430017948150635, Training Accuracy: 0.17708333333333334
Epoch 1/1, Batch Loss: 1.66853928565979, Average Training Loss: 1.6466500077928816, Training Accuracy: 0.17410714285714285
Epoch 1/1, Batch Loss: 1.6553434133529663, Average Training Loss: 1.6477366834878922, Training Accuracy: 0.16796875
Epoch 1/1, Batch Loss: 1.6687172651290894, Average Training Loss: 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.45864661654135336
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.56      0.30      0.39       132
                Educational Opportunity       0.40      0.52      0.45       138
                         Family Support       0.49      0.50      0.50       133
                      Financial Support       0.40      0.58      0.47       130
                 Program Implementation       0.60      0.38      0.46       132

                               accuracy                           0.46       665
                              macro avg       0.49      0.46      0.46       665
                           weighted avg       0.49      0.46      0.46       665

Test Confusion Matrix:
[[40 56  6 19 11]
 [17 72 18 18 13]
 [ 1  6 67 55  4]
 [ 1 12 35 76  6]
 [13 35 10 24 50]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted11.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on recent
# NLTK releases (3.9+); 'punkt' is still fetched so older releases keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled dataset (the 'Responses' and 'Label' columns are read below)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lemmatizer and stop-word set shared by every preprocess_text call. Filled lazily
# so the NLTK corpora are only read when the first response is processed.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: the raw response. Anything that is not a ``str`` (for example the
            NaN/None pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` for non-string input.
    """
    # A missing cell reaches .apply() as float NaN / None and has no .lower();
    # map it to an empty document instead of aborting the whole column.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer/stop words are built only once)
    lemmatizer, stop_words = _get_preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words, then join tokens back into a string
    return ' '.join(
        token for token in tokens
        if token not in stop_words and len(token) > 2
    )

# Clean every response and keep the result next to the raw text
cleaned_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = cleaned_responses

# Turn each label into an integer class id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
# NOTE(review): the split is random, not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# ALBERT tokenizer used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` with the module-level ALBERT tokenizer.

    All texts are encoded in one batched tokenizer call instead of one
    ``encode_plus`` call per row followed by ``torch.cat``; ``encode_plus`` is
    deprecated in favour of calling the tokenizer directly, and the batched
    call yields the same padded tensors.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        (input_ids, attention_masks): two tensors with one row per text and
        ``max_length`` columns each.
    """
    texts = df['Processed_Response'].tolist()
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',   # pad every row up to max_length
        truncation=True,        # cut rows longer than max_length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Encode both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Label tensors, row-aligned with the encoded inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

batch_size = 32

# Training batches are sampled in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# ALBERT with a classification head sized to the number of encoded classes
# NOTE(review): no torch seed is set, so the shuffling order and the newly
# initialised classifier head presumably differ between runs - consider seeding.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warmup)
epochs = 1
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Follow the model's current device so batches and weights always live together;
# for a model on the CPU the .to()/.cpu() calls below change nothing.
device = next(model.parameters()).device
# Explicit class ids keep every report/confusion matrix at one row per encoded
# class even when a class is absent from both the labels and the predictions
# (without them classification_report rejects the target_names length mismatch).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed on detached CPU copies of the logits and labels
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for classes
    # that were never predicted, without emitting a warning for each of them)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the SUM of the per-batch losses; report the per-batch mean so
    # the number is comparable with the training loss. max() guards an empty loader.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Send batches to wherever the model currently lives and bring the logits back to
# the CPU before converting to NumPy, so this also works for a model on a GPU.
device = next(model.parameters()).device
test_predictions = []

test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
for inputs, masks in test_dataloader:
    with torch.no_grad():
        outputs = model(inputs.to(device), attention_mask=masks.to(device))
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy (the 'True Labels' column is encoded with the training encoder)
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# Explicit class ids keep one row per encoded class even if a class is absent from
# the test labels and predictions; zero_division=0 reports 0.0 without warnings.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted12.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6139018535614014, Average Training Loss: 1.6139018535614014, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6500853300094604, Average Training Loss: 1.631993591785431, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6031718254089355, Average Training Loss: 1.6223863363265991, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6510738134384155, Average Training Loss: 1.6295582056045532, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.5085265636444092, Average Training Loss: 1.6053518772125244, Training Accuracy: 0.2375
Epoch 1/1, Batch Loss: 1.6427714824676514, Average Training Loss: 1.611588478088379, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.6117396354675293, Average Training Loss: 1.6116100719996862, Training Accuracy: 0.22767857142857142
Epoch 1/1, Batch Loss: 1.6986442804336548, Average Training Loss: 1.6224893480539322, Training Accuracy: 0.2265625
Epoch 1/1, Batch Loss: 1.497529149055481, Average Training Loss: 1.6086048814985487, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 27.4209988117218, Validation Accuracy: 0.20676691729323307
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.21      1.00      0.34       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.21       532
                              macro avg       0.04      0.20      0.07       532
                           weighted avg       0.04      0.21      0.07       532

Validation Confusion Matrix:
[[  0   0 112   0   0]
 [  0   0 102   0   0]
 [  0   0 110   0   0]
 [  0   0 106   0   0]
 [  0   0 102   0   0]]
Te

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on recent
# NLTK releases (3.9+); 'punkt' is still fetched so older releases keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled dataset (the 'Responses' and 'Label' columns are read below)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lemmatizer and stop-word set shared by every preprocess_text call. Filled lazily
# so the NLTK corpora are only read when the first response is processed.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: the raw response. Anything that is not a ``str`` (for example the
            NaN/None pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` for non-string input.
    """
    # A missing cell reaches .apply() as float NaN / None and has no .lower();
    # map it to an empty document instead of aborting the whole column.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer/stop words are built only once)
    lemmatizer, stop_words = _get_preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words, then join tokens back into a string
    return ' '.join(
        token for token in tokens
        if token not in stop_words and len(token) > 2
    )

# Clean every response and keep the result next to the raw text
cleaned_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = cleaned_responses

# Turn each label into an integer class id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
# NOTE(review): the split is random, not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# ALBERT tokenizer used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` with the module-level ALBERT tokenizer.

    All texts are encoded in one batched tokenizer call instead of one
    ``encode_plus`` call per row followed by ``torch.cat``; ``encode_plus`` is
    deprecated in favour of calling the tokenizer directly, and the batched
    call yields the same padded tensors.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        (input_ids, attention_masks): two tensors with one row per text and
        ``max_length`` columns each.
    """
    texts = df['Processed_Response'].tolist()
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',   # pad every row up to max_length
        truncation=True,        # cut rows longer than max_length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Encode both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Label tensors, row-aligned with the encoded inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

batch_size = 16

# Training batches are sampled in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# ALBERT with a classification head sized to the number of encoded classes
# NOTE(review): no torch seed is set, so the shuffling order and the newly
# initialised classifier head presumably differ between runs - consider seeding.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warmup)
epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Follow the model's current device so batches and weights always live together;
# for a model on the CPU the .to()/.cpu() calls below change nothing.
device = next(model.parameters()).device
# Explicit class ids keep every report/confusion matrix at one row per encoded
# class even when a class is absent from both the labels and the predictions
# (without them classification_report rejects the target_names length mismatch).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed on detached CPU copies of the logits and labels
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for classes
    # that were never predicted, without emitting a warning for each of them)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the SUM of the per-batch losses; report the per-batch mean so
    # the number is comparable with the training loss. max() guards an empty loader.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Send batches to wherever the model currently lives and bring the logits back to
# the CPU before converting to NumPy, so this also works for a model on a GPU.
device = next(model.parameters()).device
test_predictions = []

test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
for inputs, masks in test_dataloader:
    with torch.no_grad():
        outputs = model(inputs.to(device), attention_mask=masks.to(device))
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy (the 'True Labels' column is encoded with the training encoder)
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# Explicit class ids keep one row per encoded class even if a class is absent from
# the test labels and predictions; zero_division=0 reports 0.0 without warnings.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted13.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.7624506950378418, Average Training Loss: 1.7624506950378418, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.6171000003814697, Average Training Loss: 1.6897753477096558, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.712620496749878, Average Training Loss: 1.6973903973897297, Training Accuracy: 0.22916666666666666
Epoch 1/1, Batch Loss: 1.7008130550384521, Average Training Loss: 1.6982460618019104, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.510472297668457, Average Training Loss: 1.6606913089752198, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.5246309041976929, Average Training Loss: 1.6380145748456318, Training Accuracy: 0.2916666666666667
Epoch 1/1, Batch Loss: 1.6947047710418701, Average Training Loss: 1.6461131743022375, Training Accuracy: 0.25892857142857145
Epoch 1/1, Batch Loss: 1.686374306678772, Average Training Loss: 1.6511458158493042, Training Accuracy: 0.2421875
Epoch 1/1, Batch Loss: 1.6100472211837769, Average Training Loss: 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6075187969924812
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.48      0.65      0.55       132
                Educational Opportunity       0.29      0.25      0.27       138
                         Family Support       0.92      0.98      0.95       133
                      Financial Support       0.72      0.71      0.71       130
                 Program Implementation       0.62      0.45      0.53       132

                               accuracy                           0.61       665
                              macro avg       0.61      0.61      0.60       665
                           weighted avg       0.60      0.61      0.60       665

Test Confusion Matrix:
[[ 86  25   3   7  11]
 [ 62  35   5  19  17]
 [  0   1 131   1   0]
 [  9  18   3  92   8]
 [ 21  41   1   9  60]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize looks up on recent
# NLTK releases (3.9+); 'punkt' is still fetched so older releases keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled dataset (the 'Responses' and 'Label' columns are read below)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lemmatizer and stop-word set shared by every preprocess_text call. Filled lazily
# so the NLTK corpora are only read when the first response is processed.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: the raw response. Anything that is not a ``str`` (for example the
            NaN/None pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` for non-string input.
    """
    # A missing cell reaches .apply() as float NaN / None and has no .lower();
    # map it to an empty document instead of aborting the whole column.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words are looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer/stop words are built only once)
    lemmatizer, stop_words = _get_preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words, then join tokens back into a string
    return ' '.join(
        token for token in tokens
        if token not in stop_words and len(token) > 2
    )

# Clean every response and keep the result next to the raw text
cleaned_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = cleaned_responses

# Turn each label into an integer class id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
# NOTE(review): the split is random, not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# ALBERT tokenizer used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` with the module-level ALBERT tokenizer.

    All texts are encoded in one batched tokenizer call instead of one
    ``encode_plus`` call per row followed by ``torch.cat``; ``encode_plus`` is
    deprecated in favour of calling the tokenizer directly, and the batched
    call yields the same padded tensors.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        (input_ids, attention_masks): two tensors with one row per text and
        ``max_length`` columns each.
    """
    texts = df['Processed_Response'].tolist()
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',   # pad every row up to max_length
        truncation=True,        # cut rows longer than max_length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Encode both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Label tensors, row-aligned with the encoded inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

batch_size = 16

# Training batches are sampled in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# ALBERT with a classification head sized to the number of encoded classes
# NOTE(review): no torch seed is set, so the shuffling order and the newly
# initialised classifier head presumably differ between runs - consider seeding.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warmup)
epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Follow the model's current device so batches and weights always live together;
# for a model on the CPU the .to()/.cpu() calls below change nothing.
device = next(model.parameters()).device
# Explicit class ids keep every report/confusion matrix at one row per encoded
# class even when a class is absent from both the labels and the predictions
# (without them classification_report rejects the target_names length mismatch).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed on detached CPU copies of the logits and labels
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for classes
    # that were never predicted, without emitting a warning for each of them)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the SUM of the per-batch losses; report the per-batch mean so
    # the number is comparable with the training loss. max() guards an empty loader.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Send batches to wherever the model currently lives and bring the logits back to
# the CPU before converting to NumPy, so this also works for a model on a GPU.
device = next(model.parameters()).device
test_predictions = []

test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
for inputs, masks in test_dataloader:
    with torch.no_grad():
        outputs = model(inputs.to(device), attention_mask=masks.to(device))
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy (the 'True Labels' column is encoded with the training encoder)
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# Explicit class ids keep one row per encoded class even if a class is absent from
# the test labels and predictions; zero_division=0 reports 0.0 without warnings.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted14.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5846383571624756, Average Training Loss: 1.5846383571624756, Training Accuracy: 0.3125
Epoch 1/1, Batch Loss: 1.5333812236785889, Average Training Loss: 1.5590097904205322, Training Accuracy: 0.3125
Epoch 1/1, Batch Loss: 1.5728882551193237, Average Training Loss: 1.5636359453201294, Training Accuracy: 0.22916666666666666
Epoch 1/1, Batch Loss: 1.7572271823883057, Average Training Loss: 1.6120337545871735, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.8735454082489014, Average Training Loss: 1.664336085319519, Training Accuracy: 0.2
Epoch 1/1, Batch Loss: 1.6834509372711182, Average Training Loss: 1.667521893978119, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6376820802688599, Average Training Loss: 1.6632590634482247, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6255522966384888, Average Training Loss: 1.6585457175970078, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6290591955184937, Average Training Loss: 1.6552694373660617, Traini

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6150375939849624
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.46      0.57      0.51       132
                Educational Opportunity       0.54      0.14      0.23       138
                         Family Support       0.92      0.98      0.95       133
                      Financial Support       0.58      0.87      0.70       130
                 Program Implementation       0.56      0.53      0.54       132

                               accuracy                           0.62       665
                              macro avg       0.61      0.62      0.58       665
                           weighted avg       0.61      0.62      0.58       665

Test Confusion Matrix:
[[ 75   7   3  20  27]
 [ 50  20   5  38  25]
 [  1   0 131   1   0]
 [  8   1   4 113   4]
 [ 30   9   0  23  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources required by the text-cleaning step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Built once and shared by every preprocess_text() call: constructing a
# lemmatizer and re-reading the stop-word list for each row is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and discard tokens of one or two characters.

    Args:
        text: Raw response. Missing values (None/NaN, e.g. blank CSV cells)
            are treated as an empty response instead of raising
            AttributeError; other non-string values are converted with str().

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Blank cells arrive as float NaN from read_csv and have no .lower()
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on whitespace-separated words)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [_LEMMATIZER.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (fewer than three characters)
    tokens = [token for token in tokens if token not in _STOP_WORDS and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in a new column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``
    (special tokens added, padded or truncated to ``max_length``) and the
    per-text results are concatenated along the first dimension.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Sequence length every text is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text.
    """
    def _encode(text):
        # Encode a single text as a one-row batch of PyTorch tensors.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in df['Processed_Response']]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Seed PyTorch so that the randomly initialised classification head, the
# RandomSampler batch order and dropout are repeatable from run to run
# (same seed value as the train/validation split).
SEED = 42
torch.manual_seed(SEED)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over all training steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` counts the batches processed so far in this epoch (1-based)
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches seen: total_samples / batch_size is
        # fractional (and therefore wrong) once a smaller final batch arrives.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is comparable with the training
    # loss; the raw sum grows with the number of validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# No sampler/shuffle is passed, so predictions stay aligned with test_df rows
for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    # Store plain Python ints, as the training loop does
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the encoded ground-truth column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an UndefinedMetricWarning for each of them.
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted15.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6717309951782227, Average Training Loss: 1.6717309951782227, Training Accuracy: 0.0625
Epoch 1/1, Batch Loss: 1.5667948722839355, Average Training Loss: 1.619262933731079, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.641671895980835, Average Training Loss: 1.626732587814331, Training Accuracy: 0.16666666666666666
Epoch 1/1, Batch Loss: 1.6521581411361694, Average Training Loss: 1.6330889761447906, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6864961385726929, Average Training Loss: 1.643770408630371, Training Accuracy: 0.175
Epoch 1/1, Batch Loss: 1.6124614477157593, Average Training Loss: 1.6385522484779358, Training Accuracy: 0.17708333333333334
Epoch 1/1, Batch Loss: 1.6330761909484863, Average Training Loss: 1.6377699545451574, Training Accuracy: 0.17410714285714285
Epoch 1/1, Batch Loss: 1.6547795534133911, Average Training Loss: 1.6398961544036865, Training Accuracy: 0.1640625
Epoch 1/1, Batch Loss: 1.5770330429077148, Average Training Los

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 27.32491934299469, Validation Accuracy: 0.30451127819548873
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.24      0.25      0.24       110
                      Financial Support       0.40      0.46      0.43       106
                 Program Implementation       0.29      0.84      0.43       102

                               accuracy                           0.30       532
                              macro avg       0.19      0.31      0.22       532
                           weighted avg       0.18      0.30      0.22       532

Validation Confusion Matrix:
[[ 0  0 44 18 50]
 [ 0  0 26 26 50]
 [ 0  1 27 22 60]
 [ 0  0  9 49 48]
 [ 0  0  7  9 86]]
Test Accuracy: 0.275187969

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources required by the text-cleaning step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Built once and shared by every preprocess_text() call: constructing a
# lemmatizer and re-reading the stop-word list for each row is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and discard tokens of one or two characters.

    Args:
        text: Raw response. Missing values (None/NaN, e.g. blank CSV cells)
            are treated as an empty response instead of raising
            AttributeError; other non-string values are converted with str().

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Blank cells arrive as float NaN from read_csv and have no .lower()
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on whitespace-separated words)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [_LEMMATIZER.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (fewer than three characters)
    tokens = [token for token in tokens if token not in _STOP_WORDS and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in a new column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``
    (special tokens added, padded or truncated to ``max_length``) and the
    per-text results are concatenated along the first dimension.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Sequence length every text is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text.
    """
    def _encode(text):
        # Encode a single text as a one-row batch of PyTorch tensors.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in df['Processed_Response']]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Seed PyTorch so that the randomly initialised classification head, the
# RandomSampler batch order and dropout are repeatable from run to run
# (same seed value as the train/validation split).
SEED = 42
torch.manual_seed(SEED)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over all training steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` counts the batches processed so far in this epoch (1-based)
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches seen: total_samples / batch_size is
        # fractional (and therefore wrong) once a smaller final batch arrives.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is comparable with the training
    # loss; the raw sum grows with the number of validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# No sampler/shuffle is passed, so predictions stay aligned with test_df rows
for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    # Store plain Python ints, as the training loop does
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the encoded ground-truth column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an UndefinedMetricWarning for each of them.
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted16.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6299830675125122, Average Training Loss: 1.6299830675125122, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.6440982818603516, Average Training Loss: 1.6370406746864319, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6342008113861084, Average Training Loss: 1.636094053586324, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.6313270330429077, Average Training Loss: 1.63490229845047, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.5670896768569946, Average Training Loss: 1.621339774131775, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.6354494094848633, Average Training Loss: 1.6236913800239563, Training Accuracy: 0.22395833333333334
Epoch 1/1, Batch Loss: 1.5740312337875366, Average Training Loss: 1.6165970734187536, Training Accuracy: 0.23214285714285715
Epoch 1/1, Batch Loss: 1.628104329109192, Average Training Loss: 1.6180354803800583, Training Accuracy: 0.2265625
Epoch 1/1, Batch Loss: 1.659591794013977, Average Training Loss

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 27.389304757118225, Validation Accuracy: 0.19172932330827067
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.19      1.00      0.32       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.19       532
                              macro avg       0.04      0.20      0.06       532
                           weighted avg       0.04      0.19      0.06       532

Validation Confusion Matrix:
[[  0 112   0   0   0]
 [  0 102   0   0   0]
 [  0 110   0   0   0]
 [  0 106   0   0   0]
 [  0 102   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources required by the text-cleaning step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Built once and shared by every preprocess_text() call: constructing a
# lemmatizer and re-reading the stop-word list for each row is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and discard tokens of one or two characters.

    Args:
        text: Raw response. Missing values (None/NaN, e.g. blank CSV cells)
            are treated as an empty response instead of raising
            AttributeError; other non-string values are converted with str().

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Blank cells arrive as float NaN from read_csv and have no .lower()
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on whitespace-separated words)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [_LEMMATIZER.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (fewer than three characters)
    tokens = [token for token in tokens if token not in _STOP_WORDS and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in a new column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names in 'Label' to integer ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``
    (special tokens added, padded or truncated to ``max_length``) and the
    per-text results are concatenated along the first dimension.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Sequence length every text is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text.
    """
    def _encode(text):
        # Encode a single text as a one-row batch of PyTorch tensors.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in df['Processed_Response']]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Seed PyTorch so that the randomly initialised classification head, the
# RandomSampler batch order and dropout are repeatable from run to run
# (same seed value as the train/validation split).
SEED = 42
torch.manual_seed(SEED)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over all training steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # `step` counts the batches processed so far in this epoch (1-based)
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches seen: total_samples / batch_size is
        # fractional (and therefore wrong) once a smaller final batch arrives.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is comparable with the training
    # loss; the raw sum grows with the number of validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# No sampler/shuffle is passed, so predictions stay aligned with test_df rows
for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    # Store plain Python ints, as the training loop does
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the encoded ground-truth column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an UndefinedMetricWarning for each of them.
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted17.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6234962940216064, Average Training Loss: 1.6234962940216064, Training Accuracy: 0.3125
Epoch 1/3, Batch Loss: 1.559312105178833, Average Training Loss: 1.5914041996002197, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6067801713943481, Average Training Loss: 1.596529523531596, Training Accuracy: 0.2916666666666667
Epoch 1/3, Batch Loss: 1.659332036972046, Average Training Loss: 1.6122301518917084, Training Accuracy: 0.265625
Epoch 1/3, Batch Loss: 1.760433316230774, Average Training Loss: 1.6418707847595215, Training Accuracy: 0.2375
Epoch 1/3, Batch Loss: 1.6481890678405762, Average Training Loss: 1.6429238319396973, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.6363648176193237, Average Training Loss: 1.6419868298939295, Training Accuracy: 0.20535714285714285
Epoch 1/3, Batch Loss: 1.6197459697723389, Average Training Loss: 1.6392067223787308, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.6253713369369507, Average Training Loss: 1.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3, Validation Loss: 54.77011466026306, Validation Accuracy: 0.19924812030075187
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.20      1.00      0.33       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.20       532
                              macro avg       0.04      0.20      0.07       532
                           weighted avg       0.04      0.20      0.07       532

Validation Confusion Matrix:
[[  0   0   0 112   0]
 [  0   0   0 102   0]
 [  0   0   0 110   0]
 [  0   0   0 106   0]
 [  0   0   0 102   0]]
E

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/3, Validation Loss: 54.734732270240784, Validation Accuracy: 0.20676691729323307
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.21      1.00      0.34       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.21       532
                              macro avg       0.04      0.20      0.07       532
                           weighted avg       0.04      0.21      0.07       532

Validation Confusion Matrix:
[[  0   0 112   0   0]
 [  0   0 102   0   0]
 [  0   0 110   0   0]
 [  0   0 106   0   0]
 [  0   0 102   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 3/3, Validation Loss: 54.728535175323486, Validation Accuracy: 0.19172932330827067
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.19      1.00      0.32       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.19       532
                              macro avg       0.04      0.20      0.06       532
                           weighted avg       0.04      0.19      0.06       532

Validation Confusion Matrix:
[[  0 112   0   0   0]
 [  0 102   0   0   0]
 [  0 110   0   0   0]
 [  0 106   0   0   0]
 [  0 102   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by preprocess_text (stopword list, WordNet, punkt tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response into a space-joined string of kept tokens.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions, delete every character that is not an ASCII letter
    or whitespace, tokenize, lemmatize, drop English stopwords, drop tokens
    of two characters or fewer.

    Args:
        text: Raw response. ``None`` and float ``NaN`` (how pandas represents
            a blank CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing or no
        token survives the filters.
    """
    # A missing response must not abort the whole ``Series.apply`` with
    # "'float' object has no attribute 'lower'". NaN is the only float that
    # is not equal to itself.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions.
    # NOTE(review): the lookup uses lowercased, whitespace-split words, so a
    # dictionary key containing capitals, or a word with punctuation attached,
    # will not match - confirm against the contents of contractions_dict.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names to integer ids (fit() returns the encoder itself)
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Pretrained ALBERT tokenizer; tokenize_text reads it as a module-level name
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Hold out 20% of the rows for validation; the fixed random_state repeats the same split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column with the module-level tokenizer.

    Every text is encoded on its own (special tokens added, truncated and
    padded to ``max_length``) and the per-text tensors are concatenated
    along dimension 0.

    Args:
        df: DataFrame holding a ``Processed_Response`` column.
        max_length: Length each encoded sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    # One encoding per response, in column order
    encodings = [
        tokenizer.encode_plus(response, **encode_options)
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Mini-batch size for the loaders built below
batch_size = 16

# Training split: token ids, attention masks and integer labels, drawn in random order
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same tensor layout, visited sequentially
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Seed the global RNGs before the model is built. The classification head is
# newly initialised and RandomSampler shuffles through torch's global
# generator, so without a seed every run of this cell starts from a different
# state and results of different hyperparameter settings cannot be compared.
# Same value as the random_state used for train_test_split.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Fine-tune ALBERT model: pretrained encoder plus a fresh head with one output per class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
# The scheduler is stepped once per batch, so its horizon is batches * epochs
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Explicit class ids keep every report and confusion matrix at one row per
# class even when a class is never predicted or is absent from a split;
# without them classification_report raises if target_names and the labels
# actually present differ in number.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running averages over the batches seen so far. The loss is divided
        # by the batch count: total_samples / batch_size becomes fractional
        # once a short final batch has been processed and would skew the mean.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for
    # never-predicted classes instead of emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation (evaluation mode, no gradient tracking)
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the
    # training average; the raw sum grows with the number of validation batches.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (evaluation mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Predicted class = index of the largest logit in each row
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground-truth column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The explicit class ids keep one row per class even when a class is missing
# from both the true and the predicted labels; without them
# classification_report raises because target_names no longer matches.
# zero_division=0 reports 0.0 for never-predicted classes without a warning.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted18.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.583209753036499, Average Training Loss: 1.583209753036499, Training Accuracy: 0.3125
Epoch 1/3, Batch Loss: 1.7620099782943726, Average Training Loss: 1.6726098656654358, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.6452397108078003, Average Training Loss: 1.6634864807128906, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6156889200210571, Average Training Loss: 1.6515370905399323, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.6399120092391968, Average Training Loss: 1.6492120742797851, Training Accuracy: 0.175
Epoch 1/3, Batch Loss: 1.6320054531097412, Average Training Loss: 1.6463443040847778, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.6135940551757812, Average Training Loss: 1.6416656970977783, Training Accuracy: 0.15178571428571427
Epoch 1/3, Batch Loss: 1.6044169664382935, Average Training Loss: 1.6370096057653427, Training Accuracy: 0.1640625
Epoch 1/3, Batch Loss: 1.6368849277496338, Average Training Loss: 1.63699575

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3, Validation Loss: 36.52332943677902, Validation Accuracy: 0.5827067669172933
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.33      0.66      0.44       102
                         Family Support       0.82      0.96      0.89       110
                      Financial Support       0.66      0.72      0.68       106
                 Program Implementation       0.75      0.60      0.67       102

                               accuracy                           0.58       532
                              macro avg       0.51      0.59      0.53       532
                           weighted avg       0.51      0.58      0.53       532

Validation Confusion Matrix:
[[  0  81   6  17   8]
 [  0  67  11  13  11]
 [  0   0 106   4   0]
 [  0  24   5  76   1]
 [  0  34   1   6  61]]
Ep

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6947368421052632
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.65      0.56      0.60       132
                Educational Opportunity       0.53      0.59      0.55       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.64      0.83      0.72       130
                 Program Implementation       0.75      0.53      0.62       132

                               accuracy                           0.69       665
                              macro avg       0.70      0.70      0.69       665
                           weighted avg       0.70      0.69      0.69       665

Test Confusion Matrix:
[[ 74  29   2  15  12]
 [ 21  81   4  25   7]
 [  0   2 129   2   0]
 [  7  10   1 108   4]
 [ 12  32   0  18  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by preprocess_text (stopword list, WordNet, punkt tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response into a space-joined string of kept tokens.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions, delete every character that is not an ASCII letter
    or whitespace, tokenize, lemmatize, drop English stopwords, drop tokens
    of two characters or fewer.

    Args:
        text: Raw response. ``None`` and float ``NaN`` (how pandas represents
            a blank CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing or no
        token survives the filters.
    """
    # A missing response must not abort the whole ``Series.apply`` with
    # "'float' object has no attribute 'lower'". NaN is the only float that
    # is not equal to itself.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions.
    # NOTE(review): the lookup uses lowercased, whitespace-split words, so a
    # dictionary key containing capitals, or a word with punctuation attached,
    # will not match - confirm against the contents of contractions_dict.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names to integer ids (fit() returns the encoder itself)
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Pretrained ALBERT tokenizer; tokenize_text reads it as a module-level name
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Hold out 20% of the rows for validation; the fixed random_state repeats the same split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column with the module-level tokenizer.

    Every text is encoded on its own (special tokens added, truncated and
    padded to ``max_length``) and the per-text tensors are concatenated
    along dimension 0.

    Args:
        df: DataFrame holding a ``Processed_Response`` column.
        max_length: Length each encoded sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    # One encoding per response, in column order
    encodings = [
        tokenizer.encode_plus(response, **encode_options)
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Mini-batch size for the loaders built below
batch_size = 32

# Training split: token ids, attention masks and integer labels, drawn in random order
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same tensor layout, visited sequentially
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Seed the global RNGs before the model is built. The classification head is
# newly initialised and RandomSampler shuffles through torch's global
# generator, so without a seed every run of this cell starts from a different
# state and results of different hyperparameter settings cannot be compared.
# Same value as the random_state used for train_test_split.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Fine-tune ALBERT model: pretrained encoder plus a fresh head with one output per class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
# The scheduler is stepped once per batch, so its horizon is batches * epochs
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Explicit class ids keep every report and confusion matrix at one row per
# class even when a class is never predicted or is absent from a split;
# without them classification_report raises if target_names and the labels
# actually present differ in number.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running averages over the batches seen so far. The loss is divided
        # by the batch count: total_samples / batch_size becomes fractional
        # once a short final batch has been processed and would skew the mean.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for
    # never-predicted classes instead of emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation (evaluation mode, no gradient tracking)
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the
    # training average; the raw sum grows with the number of validation batches.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (evaluation mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Predicted class = index of the largest logit in each row
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground-truth column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The explicit class ids keep one row per class even when a class is missing
# from both the true and the predicted labels; without them
# classification_report raises because target_names no longer matches.
# zero_division=0 reports 0.0 for never-predicted classes without a warning.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted19.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7035026550292969, Average Training Loss: 1.7035026550292969, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.6522715091705322, Average Training Loss: 1.6778870820999146, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.575466513633728, Average Training Loss: 1.6437468926111858, Training Accuracy: 0.11458333333333333
Epoch 1/3, Batch Loss: 1.6529465913772583, Average Training Loss: 1.6460468173027039, Training Accuracy: 0.140625
Epoch 1/3, Batch Loss: 1.6727781295776367, Average Training Loss: 1.6513930797576903, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.6038706302642822, Average Training Loss: 1.643472671508789, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.6740167140960693, Average Training Loss: 1.647836106164115, Training Accuracy: 0.19196428571428573
Epoch 1/3, Batch Loss: 1.6025341749191284, Average Training Loss: 1.6421733647584915, Training Accuracy: 0.19140625
Epoch 1/3, Batch Loss: 1.5993269681930542, Average Training

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6661654135338346
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.64      0.61      0.63       132
                Educational Opportunity       0.43      0.53      0.47       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.65      0.75      0.70       130
                 Program Implementation       0.76      0.47      0.58       132

                               accuracy                           0.67       665
                              macro avg       0.69      0.67      0.67       665
                           weighted avg       0.68      0.67      0.67       665

Test Confusion Matrix:
[[ 81  34   2  11   4]
 [ 30  73   4  21  10]
 [  1   1 129   2   0]
 [  7  18   1  98   6]
 [  7  45   0  18  62]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by preprocess_text (stopword list, WordNet, punkt tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response into a space-joined string of kept tokens.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions, delete every character that is not an ASCII letter
    or whitespace, tokenize, lemmatize, drop English stopwords, drop tokens
    of two characters or fewer.

    Args:
        text: Raw response. ``None`` and float ``NaN`` (how pandas represents
            a blank CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing or no
        token survives the filters.
    """
    # A missing response must not abort the whole ``Series.apply`` with
    # "'float' object has no attribute 'lower'". NaN is the only float that
    # is not equal to itself.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions.
    # NOTE(review): the lookup uses lowercased, whitespace-split words, so a
    # dictionary key containing capitals, or a word with punctuation attached,
    # will not match - confirm against the contents of contractions_dict.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map the string class names to integer ids (fit() returns the encoder itself)
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Pretrained ALBERT tokenizer; tokenize_text reads it as a module-level name
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

# Hold out 20% of the rows for validation; the fixed random_state repeats the same split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_text(df, max_length=256):
    """Tokenize the ``Processed_Response`` column of ``df`` with the module-level ``tokenizer``.

    Every text gets special tokens added and is truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Fixed sequence length of every encoded row.

    Returns:
        ``(input_ids, attention_masks)``: two tensors of shape
        ``(len(df), max_length)``.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors instead of
        # failing on an empty batch
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch so the freshly initialised classifier head, dropout and the
# RandomSampler shuffling order are repeatable, like the seeded train/val split
torch.manual_seed(42)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas the
# PyTorch default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial lr to 0 over all optimizer steps, no warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Class ids in encoder order, passed explicitly to the reports so they always
# have one row/column per class even when a class is absent from a split.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for never-predicted classes without warnings.
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation (eval mode, no gradient tracking)
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for inputs, masks, labels in val_dataloader:
            outputs = model(inputs, attention_mask=masks, labels=labels)
            val_loss += outputs.loss.item()
            predictions.extend(np.argmax(outputs.logits.numpy(), axis=1).tolist())
            true_labels.extend(labels.numpy().tolist())

    # Mean loss per batch, so it is on the same scale as the training loss
    # (the raw sum grows with the number of validation batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (inference only: eval mode, no gradients)
model.eval()
test_predictions = []

test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        test_predictions.extend(np.argmax(logits.numpy(), axis=1).tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# labels= pins one row/column per encoder class even if a class never occurs in
# the test data; zero_division=0 reports 0.0 for never-predicted classes
# without emitting warnings.
test_class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels, test_predictions,
    labels=test_class_ids, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=test_class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted20.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7345936298370361, Average Training Loss: 1.7345936298370361, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.7116172313690186, Average Training Loss: 1.7231054306030273, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.664925456047058, Average Training Loss: 1.7037121057510376, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.6435364484786987, Average Training Loss: 1.6886681914329529, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.6325716972351074, Average Training Loss: 1.6774488925933837, Training Accuracy: 0.19375
Epoch 1/3, Batch Loss: 1.5271837711334229, Average Training Loss: 1.6524047056833904, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.5863703489303589, Average Training Loss: 1.642971226147243, Training Accuracy: 0.23660714285714285
Epoch 1/3, Batch Loss: 1.6147994995117188, Average Training Loss: 1.6394497603178024, Training Accuracy: 0.24609375
Epoch 1/3, Batch Loss: 1.655530571937561, Average Training Loss: 1.641

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6766917293233082
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.55      0.61       132
                Educational Opportunity       0.45      0.63      0.53       138
                         Family Support       0.97      0.95      0.96       133
                      Financial Support       0.74      0.75      0.75       130
                 Program Implementation       0.64      0.51      0.57       132

                               accuracy                           0.68       665
                              macro avg       0.70      0.68      0.68       665
                           weighted avg       0.69      0.68      0.68       665

Test Confusion Matrix:
[[ 72  35   2   8  15]
 [ 20  87   2  13  16]
 [  1   2 126   3   1]
 [  5  21   0  98   6]
 [  8  47   0  10  67]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize looks up on NLTK >= 3.9;
# 'punkt' is still fetched so older NLTK releases keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use.

    Reading the stopword corpus and constructing the lemmatizer do not depend
    on the row being processed, so they are done once and reused.
    """
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML, drop ``http...`` URLs, expand
    contractions, delete every character that is not a letter or whitespace,
    tokenize, lemmatize, drop English stopwords, drop tokens of length <= 2.

    Args:
        text: The raw response. A missing value (``None``/NaN, e.g. an empty
            CSV cell) is treated as an empty response; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if not isinstance(text, str):
        # A missing cell is not a str and has no .lower(); map it to ''
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-separated words looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer, stop_words = _preprocess_resources()
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Map string labels to integer class ids; the fitted encoder is kept so
# predictions can be translated back to label names later
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Encoded_Label'] = encoded_labels

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Tokenize the ``Processed_Response`` column of ``df`` with the module-level ``tokenizer``.

    Every text gets special tokens added and is truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Fixed sequence length of every encoded row.

    Returns:
        ``(input_ids, attention_masks)``: two tensors of shape
        ``(len(df), max_length)``.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors instead of
        # failing on an empty batch
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch so the freshly initialised classifier head, dropout and the
# RandomSampler shuffling order are repeatable, like the seeded train/val split
torch.manual_seed(42)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas the
# PyTorch default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial lr to 0 over all optimizer steps, no warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Class ids in encoder order, passed explicitly to the reports so they always
# have one row/column per class even when a class is absent from a split.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for never-predicted classes without warnings.
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation (eval mode, no gradient tracking)
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for inputs, masks, labels in val_dataloader:
            outputs = model(inputs, attention_mask=masks, labels=labels)
            val_loss += outputs.loss.item()
            predictions.extend(np.argmax(outputs.logits.numpy(), axis=1).tolist())
            true_labels.extend(labels.numpy().tolist())

    # Mean loss per batch, so it is on the same scale as the training loss
    # (the raw sum grows with the number of validation batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (inference only: eval mode, no gradients)
model.eval()
test_predictions = []

test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        test_predictions.extend(np.argmax(logits.numpy(), axis=1).tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# labels= pins one row/column per encoder class even if a class never occurs in
# the test data; zero_division=0 reports 0.0 for never-predicted classes
# without emitting warnings.
test_class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels, test_predictions,
    labels=test_class_ids, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=test_class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted21.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7556833028793335, Average Training Loss: 1.7556833028793335, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6408766508102417, Average Training Loss: 1.6982799768447876, Training Accuracy: 0.3125
Epoch 1/3, Batch Loss: 1.6944282054901123, Average Training Loss: 1.6969960530598958, Training Accuracy: 0.2916666666666667
Epoch 1/3, Batch Loss: 1.639119029045105, Average Training Loss: 1.6825267970561981, Training Accuracy: 0.28125
Epoch 1/3, Batch Loss: 1.7023038864135742, Average Training Loss: 1.6864822149276733, Training Accuracy: 0.2625
Epoch 1/3, Batch Loss: 1.5356720685958862, Average Training Loss: 1.6613471905390422, Training Accuracy: 0.2708333333333333
Epoch 1/3, Batch Loss: 1.6256461143493652, Average Training Loss: 1.6562470367976598, Training Accuracy: 0.2767857142857143
Epoch 1/3, Batch Loss: 1.5801743268966675, Average Training Loss: 1.6467379480600357, Training Accuracy: 0.265625
Epoch 1/3, Batch Loss: 1.6510258913040161, Average Training Loss: 1.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3, Validation Loss: 51.86420917510986, Validation Accuracy: 0.2518796992481203
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.20      0.96      0.33       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.88      0.35      0.50       102

                               accuracy                           0.25       532
                              macro avg       0.22      0.26      0.17       532
                           weighted avg       0.21      0.25      0.16       532

Validation Confusion Matrix:
[[  0 111   0   0   1]
 [  0  98   0   0   4]
 [  0 110   0   0   0]
 [  0 106   0   0   0]
 [  0  66   0   0  36]]
Ep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/3, Validation Loss: 51.395532846450806, Validation Accuracy: 0.2650375939849624
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.20      0.01      0.02       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.21      0.99      0.35       106
                 Program Implementation       0.92      0.34      0.50       102

                               accuracy                           0.27       532
                              macro avg       0.27      0.27      0.17       532
                           weighted avg       0.26      0.27      0.17       532

Validation Confusion Matrix:
[[  1   0   0 111   0]
 [  2   0   0  97   3]
 [  1   0   0 109   0]
 [  1   0   0 105   0]
 [  0   0   0  67  35]]
E

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/3, Validation Loss: 49.637722969055176, Validation Accuracy: 0.325187969924812
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.24      0.96      0.39       110
                      Financial Support       0.40      0.04      0.07       106
                 Program Implementation       0.71      0.62      0.66       102

                               accuracy                           0.33       532
                              macro avg       0.27      0.32      0.22       532
                           weighted avg       0.27      0.33      0.22       532

Validation Confusion Matrix:
[[  0   0 103   2   7]
 [  0   0  87   1  14]
 [  0   0 106   1   3]
 [  0   0 100   4   2]
 [  0   0  37   2  63]]


  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.27518796992481204
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       132
                Educational Opportunity       0.00      0.00      0.00       138
                         Family Support       0.22      0.94      0.36       133
                      Financial Support       0.17      0.02      0.03       130
                 Program Implementation       0.62      0.42      0.50       132

                               accuracy                           0.28       665
                              macro avg       0.20      0.28      0.18       665
                           weighted avg       0.20      0.28      0.18       665

Test Confusion Matrix:
[[  0   0 118   3  11]
 [  0   0 123   1  14]
 [  0   0 125   2   6]
 [  0   1 124   2   3]
 [  0   0  72   4  56]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize looks up on NLTK >= 3.9;
# 'punkt' is still fetched so older NLTK releases keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use.

    Reading the stopword corpus and constructing the lemmatizer do not depend
    on the row being processed, so they are done once and reused.
    """
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML, drop ``http...`` URLs, expand
    contractions, delete every character that is not a letter or whitespace,
    tokenize, lemmatize, drop English stopwords, drop tokens of length <= 2.

    Args:
        text: The raw response. A missing value (``None``/NaN, e.g. an empty
            CSV cell) is treated as an empty response; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if not isinstance(text, str):
        # A missing cell is not a str and has no .lower(); map it to ''
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-separated words looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer, stop_words = _preprocess_resources()
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Map string labels to integer class ids; the fitted encoder is kept so
# predictions can be translated back to label names later
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Encoded_Label'] = encoded_labels

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Tokenize the ``Processed_Response`` column of ``df`` with the module-level ``tokenizer``.

    Every text gets special tokens added and is truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Fixed sequence length of every encoded row.

    Returns:
        ``(input_ids, attention_masks)``: two tensors of shape
        ``(len(df), max_length)``.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors instead of
        # failing on an empty batch
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch so the freshly initialised classifier head, dropout and the
# RandomSampler shuffling order are repeatable, like the seeded train/val split
torch.manual_seed(42)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas the
# PyTorch default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial lr to 0 over all optimizer steps, no warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Class ids in encoder order, passed explicitly to the reports so they always
# have one row/column per class even when a class is absent from a split.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report.
    # zero_division=0 reports 0.0 for never-predicted classes without warnings.
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation (eval mode, no gradient tracking)
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for inputs, masks, labels in val_dataloader:
            outputs = model(inputs, attention_mask=masks, labels=labels)
            val_loss += outputs.loss.item()
            predictions.extend(np.argmax(outputs.logits.numpy(), axis=1).tolist())
            true_labels.extend(labels.numpy().tolist())

    # Mean loss per batch, so it is on the same scale as the training loss
    # (the raw sum grows with the number of validation batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# Feed batches on whichever device the model's parameters are stored, so the
# same code works on CPU and GPU.
device = next(model.parameters()).device

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Bring logits back to host memory before handing them to NumPy
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
# Passing the full list of class ids keeps one row per known class even when a
# class is absent from both the true and predicted labels; without it,
# classification_report rejects target_names of a different length.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # report 0.0 for never-predicted classes without a warning
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted22.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.720790147781372, Average Training Loss: 1.720790147781372, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.6942285299301147, Average Training Loss: 1.7075093388557434, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.5965369939804077, Average Training Loss: 1.6705185572306316, Training Accuracy: 0.2708333333333333
Epoch 1/3, Batch Loss: 1.7059752941131592, Average Training Loss: 1.6793827414512634, Training Accuracy: 0.265625
Epoch 1/3, Batch Loss: 1.701432466506958, Average Training Loss: 1.6837926864624024, Training Accuracy: 0.2375
Epoch 1/3, Batch Loss: 1.6420032978057861, Average Training Loss: 1.6768277883529663, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.7146075963974, Average Training Loss: 1.6822249037878854, Training Accuracy: 0.22321428571428573
Epoch 1/3, Batch Loss: 1.6298654079437256, Average Training Loss: 1.6756799668073654, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.564091682434082, Average Training Loss: 1.6632812685436673, 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.77      0.72       132
                Educational Opportunity       0.53      0.59      0.55       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.74      0.78      0.76       130
                 Program Implementation       0.74      0.52      0.61       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[101  13   2   7   9]
 [ 29  81   4  15   9]
 [  1   4 127   1   0]
 [  6  16   1 101   6]
 [ 10  40   0  13  69]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
# Newer NLTK releases load the 'punkt_tab' tables inside nltk.word_tokenize
# instead of the older 'punkt' models; fetching both keeps the notebook
# runnable whichever NLTK version the unpinned install provides.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on the first call only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Turn one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, delete ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then drop English stopwords and tokens of one or two characters.

    Args:
        text: The raw response. ``None`` and NaN (what ``pd.read_csv`` yields
            for an empty cell) produce an empty string; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing cells have no .lower(); answer them with an empty response
    # instead of aborting the whole column-wise apply().
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer and stopword set are built
    # once and reused, rather than re-reading the corpus for every row)
    lemmatizer, stop_words = _get_text_resources()
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning function over every raw response
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class list first, then map each label to its index
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train and validation sets (80% / 20%, fixed seed)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: pretrained ALBERT tokenizer shared by all tokenize_text() calls
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text gets the tokenizer's special tokens and is truncated or padded
    to exactly ``max_length`` positions.

    Args:
        df: DataFrame with a ``'Processed_Response'`` column of texts.
        max_length: Number of token positions per encoded row.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per DataFrame
        row and ``max_length`` columns. A DataFrame with no rows yields two
        empty ``(0, max_length)`` int64 tensors.
    """
    texts = df['Processed_Response']

    # torch.cat() refuses an empty list, so answer an empty frame directly
    if len(texts) == 0:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        # Each entry is a (1, max_length) tensor; they are stacked below
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
# The dtype is stated explicitly so the class indices are int64 regardless of
# the integer dtype the DataFrame column happens to carry.
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 32

# Seed torch's global RNG before anything random happens: it drives the
# shuffling done by RandomSampler and the initialisation of the newly created
# classifier weights, so without it two runs of this cell are not comparable.
torch.manual_seed(42)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Batches are moved to whichever device the model's parameters are stored on,
# so the loop runs unchanged on CPU and GPU.
device = next(model.parameters()).device
# Explicit class ids keep every report/matrix at one row per known class, even
# when a class is missing from both the labels and the predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(batch_labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        # (divide by the number of batches seen; dividing by
        # total_samples / batch_size is off whenever the last batch is short)
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    # zero_division=0 reports 0.0 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is a sum over batches; report the per-batch mean so the figure
    # is on the same scale as the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# Feed batches on whichever device the model's parameters are stored, so the
# same code works on CPU and GPU.
device = next(model.parameters()).device

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Bring logits back to host memory before handing them to NumPy
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
# Passing the full list of class ids keeps one row per known class even when a
# class is absent from both the true and predicted labels; without it,
# classification_report rejects target_names of a different length.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # report 0.0 for never-predicted classes without a warning
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted23.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7248613834381104, Average Training Loss: 1.7248613834381104, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.7074816226959229, Average Training Loss: 1.7161715030670166, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.545632243156433, Average Training Loss: 1.659325083096822, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.6589164733886719, Average Training Loss: 1.6592229306697845, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.7323391437530518, Average Training Loss: 1.673846173286438, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.5561299324035645, Average Training Loss: 1.654226799805959, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.6595916748046875, Average Training Loss: 1.654993210520063, Training Accuracy: 0.24107142857142858
Epoch 1/3, Batch Loss: 1.7037240266799927, Average Training Loss: 1.6610845625400543, Training Accuracy: 0.2421875
Epoch 1/3, Batch Loss: 1.5393729209899902, Average Training Loss: 1.6475610468122694,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3, Validation Loss: 27.103740453720093, Validation Accuracy: 0.3082706766917293
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.27      0.77      0.40       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.36      0.71      0.48       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.31       532
                              macro avg       0.13      0.30      0.18       532
                           weighted avg       0.13      0.31      0.18       532

Validation Confusion Matrix:
[[86  0 26  0  0]
 [70  0 32  0  0]
 [32  0 78  0  0]
 [65  0 40  0  1]
 [61  0 41  0  0]]
Epoch 2/3, Batch Loss: 1.6

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6406015037593985
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.51      0.58      0.54       132
                Educational Opportunity       0.45      0.46      0.45       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.65      0.78      0.71       130
                 Program Implementation       0.68      0.45      0.55       132

                               accuracy                           0.64       665
                              macro avg       0.65      0.64      0.64       665
                           weighted avg       0.65      0.64      0.64       665

Test Confusion Matrix:
[[ 76  23   2  17  14]
 [ 36  63   4  23  12]
 [  5   0 126   2   0]
 [ 13  13   1 101   2]
 [ 18  41   0  13  60]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
# Newer NLTK releases load the 'punkt_tab' tables inside nltk.word_tokenize
# instead of the older 'punkt' models; fetching both keeps the notebook
# runnable whichever NLTK version the unpinned install provides.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on the first call only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Turn one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, delete ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then drop English stopwords and tokens of one or two characters.

    Args:
        text: The raw response. ``None`` and NaN (what ``pd.read_csv`` yields
            for an empty cell) produce an empty string; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing cells have no .lower(); answer them with an empty response
    # instead of aborting the whole column-wise apply().
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer and stopword set are built
    # once and reused, rather than re-reading the corpus for every row)
    lemmatizer, stop_words = _get_text_resources()
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning function over every raw response
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class list first, then map each label to its index
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train and validation sets (80% / 20%, fixed seed)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: pretrained ALBERT tokenizer shared by all tokenize_text() calls
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text gets the tokenizer's special tokens and is truncated or padded
    to exactly ``max_length`` positions.

    Args:
        df: DataFrame with a ``'Processed_Response'`` column of texts.
        max_length: Number of token positions per encoded row.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per DataFrame
        row and ``max_length`` columns. A DataFrame with no rows yields two
        empty ``(0, max_length)`` int64 tensors.
    """
    texts = df['Processed_Response']

    # torch.cat() refuses an empty list, so answer an empty frame directly
    if len(texts) == 0:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        # Each entry is a (1, max_length) tensor; they are stacked below
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
# The dtype is stated explicitly so the class indices are int64 regardless of
# the integer dtype the DataFrame column happens to carry.
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 32

# Seed torch's global RNG before anything random happens: it drives the
# shuffling done by RandomSampler and the initialisation of the newly created
# classifier weights, so without it two runs of this cell are not comparable.
torch.manual_seed(42)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Batches are moved to whichever device the model's parameters are stored on,
# so the loop runs unchanged on CPU and GPU.
device = next(model.parameters()).device
# Explicit class ids keep every report/matrix at one row per known class, even
# when a class is missing from both the labels and the predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(batch_labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        # (divide by the number of batches seen; dividing by
        # total_samples / batch_size is off whenever the last batch is short)
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    # zero_division=0 reports 0.0 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is a sum over batches; report the per-batch mean so the figure
    # is on the same scale as the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

# Feed batches on whichever device the model's parameters are stored, so the
# same code works on CPU and GPU.
device = next(model.parameters()).device

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Bring logits back to host memory before handing them to NumPy
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
# Passing the full list of class ids keeps one row per known class even when a
# class is absent from both the true and predicted labels; without it,
# classification_report rejects target_names of a different length.
class_ids = np.arange(len(label_encoder.classes_))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # report 0.0 for never-predicted classes without a warning
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted24.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6319729089736938, Average Training Loss: 1.6319729089736938, Training Accuracy: 0.28125
Epoch 1/3, Batch Loss: 1.653063178062439, Average Training Loss: 1.6425180435180664, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.5803924798965454, Average Training Loss: 1.6218095223108928, Training Accuracy: 0.2604166666666667
Epoch 1/3, Batch Loss: 1.6268131732940674, Average Training Loss: 1.6230604350566864, Training Accuracy: 0.2578125
Epoch 1/3, Batch Loss: 1.6230710744857788, Average Training Loss: 1.6230625629425048, Training Accuracy: 0.2375
Epoch 1/3, Batch Loss: 1.6135132312774658, Average Training Loss: 1.6214710076649983, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.5653491020202637, Average Training Loss: 1.6134535925728934, Training Accuracy: 0.27232142857142855
Epoch 1/3, Batch Loss: 1.5166380405426025, Average Training Loss: 1.601351648569107, Training Accuracy: 0.28515625
Epoch 1/3, Batch Loss: 1.6301697492599487, Average Training Loss: 1.60455

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6962406015037594
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.69      0.68       132
                Educational Opportunity       0.46      0.49      0.48       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.69      0.82      0.75       130
                 Program Implementation       0.78      0.52      0.62       132

                               accuracy                           0.70       665
                              macro avg       0.71      0.70      0.70       665
                           weighted avg       0.70      0.70      0.69       665

Test Confusion Matrix:
[[ 91  26   3   6   6]
 [ 32  68   4  24  10]
 [  0   2 129   2   0]
 [  5  14   1 107   3]
 [  9  38   0  17  68]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
# Newer NLTK releases load the 'punkt_tab' tables inside nltk.word_tokenize
# instead of the older 'punkt' models; fetching both keeps the notebook
# runnable whichever NLTK version the unpinned install provides.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on the first call only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Turn one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, delete ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then drop English stopwords and tokens of one or two characters.

    Args:
        text: The raw response. ``None`` and NaN (what ``pd.read_csv`` yields
            for an empty cell) produce an empty string; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing cells have no .lower(); answer them with an empty response
    # instead of aborting the whole column-wise apply().
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization (lemmatizer and stopword set are built
    # once and reused, rather than re-reading the corpus for every row)
    lemmatizer, stop_words = _get_text_resources()
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning function over every raw response
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class list first, then map each label to its index
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train and validation sets (80% / 20%, fixed seed)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: pretrained ALBERT tokenizer shared by all tokenize_text() calls
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column of ``df`` with the global tokenizer.

    Every text is encoded on its own, padded or truncated to ``max_length``,
    and the per-row tensors are concatenated along dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column.
        max_length: Length each encoded sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encoded_rows = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits into input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels, val_labels = (
    torch.tensor(split['Encoded_Label'].values) for split in (train_df, val_df)
)

batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained ALBERT with one classifier output per encoded class.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay (no warmup) over all training steps.
epochs = 3
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: for each epoch, one pass over the training batches followed
# by a full validation pass, with metrics printed for both.

# Run every batch on whatever device the model currently lives on, so the
# loop keeps working if the model is moved to a GPU.
device = next(model.parameters()).device

# Explicit label ids keep the reports and confusion matrices aligned with
# label_encoder.classes_ even when a class is absent from a split; without
# them classification_report raises when target_names has more entries than
# the classes it observes.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen stays correct when the last batch is smaller than
        # batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the sum of per-batch losses; report its mean so the figure
    # is comparable with the training average and across batch sizes.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load the held-out test file and run it through the same cleaning and
# tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run batches on the model's own device so inference also works off-CPU.
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Explicit label ids keep the report and matrix aligned with
# label_encoder.classes_ even if a class never occurs in the test data;
# without them classification_report raises on the target_names mismatch.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(
    test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted25.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7149863243103027, Average Training Loss: 1.7149863243103027, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.6153644323349, Average Training Loss: 1.6651753783226013, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6300609111785889, Average Training Loss: 1.653470555941264, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.5952715873718262, Average Training Loss: 1.6389208137989044, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.561693549156189, Average Training Loss: 1.6234753608703614, Training Accuracy: 0.2375
Epoch 1/3, Batch Loss: 1.5630968809127808, Average Training Loss: 1.6134122808774312, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.677355408668518, Average Training Loss: 1.622547013419015, Training Accuracy: 0.21428571428571427
Epoch 1/3, Batch Loss: 1.6357386112213135, Average Training Loss: 1.6241959631443024, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.7467814683914185, Average Training Loss: 1.63781657483842

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7097744360902256
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.75      0.70       132
                Educational Opportunity       0.49      0.57      0.53       138
                         Family Support       0.95      0.98      0.96       133
                      Financial Support       0.75      0.80      0.78       130
                 Program Implementation       0.77      0.45      0.57       132

                               accuracy                           0.71       665
                              macro avg       0.72      0.71      0.71       665
                           weighted avg       0.72      0.71      0.71       665

Test Confusion Matrix:
[[ 99  19   3   5   6]
 [ 32  79   4  12  11]
 [  1   0 130   2   0]
 [  7  18   0 104   1]
 [ 11  46   0  15  60]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data packages used further down in this cell.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled dataset from the mounted Drive folder.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, delete every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    remove English stop words, and discard tokens of two characters or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty response.

    Returns:
        The cleaned text as a single string; ``''`` when ``text`` is not a
        string or when no token survives the filtering.
    """
    # pandas represents missing cells as NaN (a float), which has no
    # ``.lower()``; return an empty document instead of crashing ``.apply``.
    if not isinstance(text, str):
        return ''

    # The lemmatizer and the stop-word set do not depend on ``text``: build
    # them on first use and cache them on the function object so they are
    # not rebuilt (and the stop-word list re-read) for every row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (must run before apostrophes are stripped below)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, lemmatize, then drop stop words and tokens of length <= 2
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Fit the encoder on the 'Label' column, then store the integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer used by tokenize_text below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` with the global tokenizer.

    Every text is encoded on its own, padded or truncated to ``max_length``,
    and the per-row tensors are concatenated along dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column.
        max_length: Length each encoded sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encoded_rows = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits into input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels, val_labels = (
    torch.tensor(split['Encoded_Label'].values) for split in (train_df, val_df)
)

batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained ALBERT with one classifier output per encoded class.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay (no warmup) over all training steps.
epochs = 3
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: for each epoch, one pass over the training batches followed
# by a full validation pass, with metrics printed for both.

# Run every batch on whatever device the model currently lives on, so the
# loop keeps working if the model is moved to a GPU.
device = next(model.parameters()).device

# Explicit label ids keep the reports and confusion matrices aligned with
# label_encoder.classes_ even when a class is absent from a split; without
# them classification_report raises when target_names has more entries than
# the classes it observes.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen stays correct when the last batch is smaller than
        # batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the sum of per-batch losses; report its mean so the figure
    # is comparable with the training average and across batch sizes.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load the held-out test file and run it through the same cleaning and
# tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run batches on the model's own device so inference also works off-CPU.
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Explicit label ids keep the report and matrix aligned with
# label_encoder.classes_ even if a class never occurs in the test data;
# without them classification_report raises on the target_names mismatch.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(
    test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted26.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7671796083450317, Average Training Loss: 1.7671796083450317, Training Accuracy: 0.375
Epoch 1/3, Batch Loss: 1.7388781309127808, Average Training Loss: 1.7530288696289062, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.5502890348434448, Average Training Loss: 1.6854489247004192, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.6932462453842163, Average Training Loss: 1.6873982548713684, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.6752853393554688, Average Training Loss: 1.6849756717681885, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.5147883892059326, Average Training Loss: 1.6566111246744792, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.6619527339935303, Average Training Loss: 1.6573742117200578, Training Accuracy: 0.17857142857142858
Epoch 1/3, Batch Loss: 1.6054422855377197, Average Training Loss: 1.6508827209472656, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6325973272323608, Average Training Los

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6917293233082706
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.67      0.67       132
                Educational Opportunity       0.47      0.51      0.49       138
                         Family Support       0.95      0.96      0.96       133
                      Financial Support       0.71      0.77      0.74       130
                 Program Implementation       0.68      0.56      0.61       132

                               accuracy                           0.69       665
                              macro avg       0.70      0.69      0.69       665
                           weighted avg       0.69      0.69      0.69       665

Test Confusion Matrix:
[[ 88  19   3   8  14]
 [ 30  70   3  20  15]
 [  1   2 128   2   0]
 [  4  19   1 100   6]
 [  9  38   0  11  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data packages used further down in this cell.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled dataset from the mounted Drive folder.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalize one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, delete every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    remove English stop words, and discard tokens of two characters or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty response.

    Returns:
        The cleaned text as a single string; ``''`` when ``text`` is not a
        string or when no token survives the filtering.
    """
    # pandas represents missing cells as NaN (a float), which has no
    # ``.lower()``; return an empty document instead of crashing ``.apply``.
    if not isinstance(text, str):
        return ''

    # The lemmatizer and the stop-word set do not depend on ``text``: build
    # them on first use and cache them on the function object so they are
    # not rebuilt (and the stop-word list re-read) for every row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (must run before apostrophes are stripped below)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, lemmatize, then drop stop words and tokens of length <= 2
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Fit the encoder on the 'Label' column, then store the integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer used by tokenize_text below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column of ``df`` with the global tokenizer.

    Every text is encoded on its own, padded or truncated to ``max_length``,
    and the per-row tensors are concatenated along dimension 0.

    Args:
        df: DataFrame providing a 'Processed_Response' column.
        max_length: Length each encoded sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encoded_rows = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits into input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels, val_labels = (
    torch.tensor(split['Encoded_Label'].values) for split in (train_df, val_df)
)

batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained ALBERT with one classifier output per encoded class.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay (no warmup) over all training steps.
epochs = 3
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: for each epoch, one pass over the training batches followed
# by a full validation pass, with metrics printed for both.

# Run every batch on whatever device the model currently lives on, so the
# loop keeps working if the model is moved to a GPU.
device = next(model.parameters()).device

# Explicit label ids keep the reports and confusion matrices aligned with
# label_encoder.classes_ even when a class is absent from a split; without
# them classification_report raises when target_names has more entries than
# the classes it observes.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen stays correct when the last batch is smaller than
        # batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the sum of per-batch losses; report its mean so the figure
    # is comparable with the training average and across batch sizes.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load the held-out test file and run it through the same cleaning and
# tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run batches on the model's own device so inference also works off-CPU.
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Explicit label ids keep the report and matrix aligned with
# label_encoder.classes_ even if a class never occurs in the test data;
# without them classification_report raises on the target_names mismatch.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(
    test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted27.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.7268366813659668, Average Training Loss: 1.7268366813659668, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.601344108581543, Average Training Loss: 1.6640903949737549, Training Accuracy: 0.28125
Epoch 1/3, Batch Loss: 1.6071070432662964, Average Training Loss: 1.645095944404602, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.5958118438720703, Average Training Loss: 1.6327749192714691, Training Accuracy: 0.2421875
Epoch 1/3, Batch Loss: 1.634935736656189, Average Training Loss: 1.633207082748413, Training Accuracy: 0.225
Epoch 1/3, Batch Loss: 1.7574379444122314, Average Training Loss: 1.6539122263590496, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.783591628074646, Average Training Loss: 1.6724378551755632, Training Accuracy: 0.17857142857142858
Epoch 1/3, Batch Loss: 1.6494179964065552, Average Training Loss: 1.6695603728294373, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6917147636413574, Average Training Loss: 1.6

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.43759398496240604
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.49      0.30      0.37       132
                Educational Opportunity       0.39      0.47      0.43       138
                         Family Support       0.45      0.14      0.22       133
                      Financial Support       0.42      0.74      0.54       130
                 Program Implementation       0.49      0.54      0.51       132

                               accuracy                           0.44       665
                              macro avg       0.45      0.44      0.41       665
                           weighted avg       0.45      0.44      0.41       665

Test Confusion Matrix:
[[40 36  6 21 29]
 [20 65  5 30 18]
 [ 9 23 19 63 19]
 [ 5 15  5 96  9]
 [ 8 28  7 18 71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alBERTPredicted27.csv


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stopword list, the
# WordNet data for the lemmatizer, and the Punkt tokenizer models.
# Recent NLTK releases load 'punkt_tab' inside nltk.word_tokenize while older
# ones load 'punkt', so both are fetched to keep the cell runnable on either.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data (expects 'Responses' and 'Label' columns,
# which the cells below read)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline: lowercase -> strip HTML -> drop URLs -> expand contractions ->
    keep only ASCII letters and whitespace -> tokenize -> drop stopwords ->
    lemmatize -> drop tokens of length <= 2.

    Args:
        text: Raw response. Missing values (None or float NaN, which is what
            pd.read_csv yields for empty cells) produce ''. Any other
            non-string value is converted with str() first.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    # Guard against missing / non-string cells instead of failing on .lower()
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards; this function is applied row by row to whole columns.
    try:
        lemmatizer, stop_words = preprocess_text._resources
    except AttributeError:
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._resources = (lemmatizer, stop_words)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Drop stopwords BEFORE lemmatizing: the noun lemmatizer mangles some
        # of them (e.g. 'does' -> 'doe'), after which they no longer match the
        # stopword list and would leak into the output.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Lemmas that are themselves stopwords, or too short, are dropped too
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response once; the cleaned text is what gets tokenized later
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
# NOTE(review): the split is not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer belonging to the same 'albert-base-v2' checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: (input_ids, attention_masks), two integer tensors of shape
        (len(df), max_length). An empty frame yields two (0, max_length)
        tensors instead of raising.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # every row is still padded/truncated to exactly max_length tokens.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed the RNGs so that the randomly initialised classifier head, the batch
# shuffling and dropout are repeatable across kernel restarts
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order, validation batches in file order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps that class's default of applying no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate to zero over all steps, without warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Batches are moved to whichever device holds the model weights, so the loop
# works unchanged whether or not the model has been moved to a GPU.
device = next(model.parameters()).device
# Pin the label set used by the reports so they keep one row per class even
# when a class is missing from both the labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy logits and labels to host memory before handing them to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss sums one loss value per batch, so the running mean divides
        # by the number of batches seen; total_samples / batch_size would be
        # fractional on a final partial batch.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per validation batch so it is comparable with the
    # average training loss above (val_loss itself is the sum over batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (expects 'Responses' and 'True Labels' columns)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run inference on whichever device holds the model weights
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    # Copy logits to host memory before handing them to NumPy
    preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The label set is pinned so the report keeps one row per class even when a
# class is missing from both the test labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels, test_predictions,
    labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted28.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.602135419845581, Average Training Loss: 1.602135419845581, Training Accuracy: 0.28125
Epoch 1/3, Batch Loss: 1.7353559732437134, Average Training Loss: 1.6687456965446472, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.5906835794448853, Average Training Loss: 1.6427249908447266, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.5529576539993286, Average Training Loss: 1.620283156633377, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.5968843698501587, Average Training Loss: 1.6156033992767334, Training Accuracy: 0.225
Epoch 1/3, Batch Loss: 1.5673458576202393, Average Training Loss: 1.6075604756673176, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.60126793384552, Average Training Loss: 1.6066615411213465, Training Accuracy: 0.24553571428571427
Epoch 1/3, Batch Loss: 1.6330788135528564, Average Training Loss: 1.6099637001752853, Training Accuracy: 0.23828125
Epoch 1/3, Batch Loss: 1.5890148878097534, Average Training Loss: 1.60763605

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6330827067669172
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.54      0.60       132
                Educational Opportunity       0.41      0.49      0.45       138
                         Family Support       0.93      0.94      0.94       133
                      Financial Support       0.59      0.72      0.65       130
                 Program Implementation       0.62      0.48      0.54       132

                               accuracy                           0.63       665
                              macro avg       0.65      0.63      0.64       665
                           weighted avg       0.65      0.63      0.63       665

Test Confusion Matrix:
[[ 71  33   1  13  14]
 [ 18  68   5  28  19]
 [  2   1 125   5   0]
 [  4  23   3  93   7]
 [  9  41   0  18  64]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stopword list, the
# WordNet data for the lemmatizer, and the Punkt tokenizer models.
# Recent NLTK releases load 'punkt_tab' inside nltk.word_tokenize while older
# ones load 'punkt', so both are fetched to keep the cell runnable on either.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data (expects 'Responses' and 'Label' columns,
# which the cells below read)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline: lowercase -> strip HTML -> drop URLs -> expand contractions ->
    keep only ASCII letters and whitespace -> tokenize -> drop stopwords ->
    lemmatize -> drop tokens of length <= 2.

    Args:
        text: Raw response. Missing values (None or float NaN, which is what
            pd.read_csv yields for empty cells) produce ''. Any other
            non-string value is converted with str() first.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    # Guard against missing / non-string cells instead of failing on .lower()
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards; this function is applied row by row to whole columns.
    try:
        lemmatizer, stop_words = preprocess_text._resources
    except AttributeError:
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._resources = (lemmatizer, stop_words)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Drop stopwords BEFORE lemmatizing: the noun lemmatizer mangles some
        # of them (e.g. 'does' -> 'doe'), after which they no longer match the
        # stopword list and would leak into the output.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Lemmas that are themselves stopwords, or too short, are dropped too
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response once; the cleaned text is what gets tokenized later
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
# NOTE(review): the split is not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer belonging to the same 'albert-base-v2' checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: (input_ids, attention_masks), two integer tensors of shape
        (len(df), max_length). An empty frame yields two (0, max_length)
        tensors instead of raising.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # every row is still padded/truncated to exactly max_length tokens.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed the RNGs so that the randomly initialised classifier head, the batch
# shuffling and dropout are repeatable across kernel restarts
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order, validation batches in file order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps that class's default of applying no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate to zero over all steps, without warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Batches are moved to whichever device holds the model weights, so the loop
# works unchanged whether or not the model has been moved to a GPU.
device = next(model.parameters()).device
# Pin the label set used by the reports so they keep one row per class even
# when a class is missing from both the labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy logits and labels to host memory before handing them to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss sums one loss value per batch, so the running mean divides
        # by the number of batches seen; total_samples / batch_size would be
        # fractional on a final partial batch.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per validation batch so it is comparable with the
    # average training loss above (val_loss itself is the sum over batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (expects 'Responses' and 'True Labels' columns)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run inference on whichever device holds the model weights
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    # Copy logits to host memory before handing them to NumPy
    preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The label set is pinned so the report keeps one row per class even when a
# class is missing from both the test labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels, test_predictions,
    labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted8.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6242587566375732, Average Training Loss: 1.6242587566375732, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6221619844436646, Average Training Loss: 1.623210370540619, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6171437501907349, Average Training Loss: 1.6211881637573242, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5416814088821411, Average Training Loss: 1.6013114750385284, Training Accuracy: 0.234375
Epoch 1/5, Batch Loss: 1.5478800535202026, Average Training Loss: 1.5906251907348632, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.6938408613204956, Average Training Loss: 1.6078278024991353, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.6045254468917847, Average Training Loss: 1.607356037412371, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.6607087850570679, Average Training Loss: 1.614025130867958, Training Accuracy: 0.234375
Epoch 1/5, Batch Loss: 1.6655369997024536, Average Training Loss: 1.6197486718495686, Trainin

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.70      0.70       132
                Educational Opportunity       0.56      0.57      0.56       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.70      0.84      0.76       130
                 Program Implementation       0.69      0.53      0.60       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 92  15   3   9  13]
 [ 24  79   4  18  13]
 [  0   1 129   3   0]
 [  5  10   0 109   6]
 [  9  37   0  16  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download the NLTK resources used by preprocess_text: the stopword list, the
# WordNet data for the lemmatizer, and the Punkt tokenizer models.
# Recent NLTK releases load 'punkt_tab' inside nltk.word_tokenize while older
# ones load 'punkt', so both are fetched to keep the cell runnable on either.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the labelled training data (expects 'Responses' and 'Label' columns,
# which the cells below read)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline: lowercase -> strip HTML -> drop URLs -> expand contractions ->
    keep only ASCII letters and whitespace -> tokenize -> drop stopwords ->
    lemmatize -> drop tokens of length <= 2.

    Args:
        text: Raw response. Missing values (None or float NaN, which is what
            pd.read_csv yields for empty cells) produce ''. Any other
            non-string value is converted with str() first.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    # Guard against missing / non-string cells instead of failing on .lower()
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards; this function is applied row by row to whole columns.
    try:
        lemmatizer, stop_words = preprocess_text._resources
    except AttributeError:
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._resources = (lemmatizer, stop_words)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Drop stopwords BEFORE lemmatizing: the noun lemmatizer mangles some
        # of them (e.g. 'does' -> 'doe'), after which they no longer match the
        # stopword list and would leak into the output.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Lemmas that are themselves stopwords, or too short, are dropped too
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response once; the cleaned text is what gets tokenized later
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
# NOTE(review): the split is not stratified by label - confirm that is intended.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer belonging to the same 'albert-base-v2' checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Tokenize the 'Processed_Response' column into fixed-length model inputs.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: (input_ids, attention_masks), two integer tensors of shape
        (len(df), max_length). An empty frame yields two (0, max_length)
        tensors instead of raising.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # every row is still padded/truncated to exactly max_length tokens.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed the RNGs so that the randomly initialised classifier head, the batch
# shuffling and dropout are repeatable across kernel restarts
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order, validation batches in file order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps that class's default of applying no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate to zero over all steps, without warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Batches are moved to whichever device holds the model weights, so the loop
# works unchanged whether or not the model has been moved to a GPU.
device = next(model.parameters()).device
# Pin the label set used by the reports so they keep one row per class even
# when a class is missing from both the labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy logits and labels to host memory before handing them to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        batch_labels = labels.cpu().numpy()

        correct_predictions += np.sum(preds == batch_labels)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(batch_labels.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss sums one loss value per batch, so the running mean divides
        # by the number of batches seen; total_samples / batch_size would be
        # fractional on a final partial batch.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train, all_preds_train,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per validation batch so it is comparable with the
    # average training loss above (val_loss itself is the sum over batches)
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data (expects 'Responses' and 'True Labels' columns)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
# Run inference on whichever device holds the model weights
device = next(model.parameters()).device
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    # Copy logits to host memory before handing them to NumPy
    preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The label set is pinned so the report keeps one row per class even when a
# class is missing from both the test labels and the predictions.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels, test_predictions,
    labels=class_ids, target_names=label_encoder.classes_, zero_division=0)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted30.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6723134517669678, Average Training Loss: 1.6723134517669678, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5720514059066772, Average Training Loss: 1.6221824288368225, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.607093334197998, Average Training Loss: 1.617152730623881, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.6303611993789673, Average Training Loss: 1.6204548478126526, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6970375776290894, Average Training Loss: 1.63577139377594, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.730412244796753, Average Training Loss: 1.6515448689460754, Training Accuracy: 0.17708333333333334
Epoch 1/5, Batch Loss: 1.5967410802841187, Average Training Loss: 1.6437157562800817, Training Accuracy: 0.16964285714285715
Epoch 1/5, Batch Loss: 1.6276072263717651, Average Training Loss: 1.641702190041542, Training Accuracy: 0.1796875
Epoch 1/5, Batch Loss: 1.648931622505188, Average Training Loss: 1.6

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7037593984962406
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.66      0.67       132
                Educational Opportunity       0.49      0.59      0.53       138
                         Family Support       0.96      0.97      0.96       133
                      Financial Support       0.73      0.82      0.77       130
                 Program Implementation       0.70      0.48      0.57       132

                               accuracy                           0.70       665
                              macro avg       0.71      0.70      0.70       665
                           weighted avg       0.71      0.70      0.70       665

Test Confusion Matrix:
[[ 87  26   3   6  10]
 [ 20  81   2  21  14]
 [  0   3 129   1   0]
 [  7  12   1 107   3]
 [ 12  44   0  12  64]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# resource when nltk.word_tokenize() is called; fetch it too so the cell also
# runs against a fresh, unpinned `pip install nltk`.
nltk.download('punkt_tab')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-filled cache for the objects preprocess_text() needs on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Both values are built before the cache is touched, so a failed
        # stop-word lookup cannot leave a half-filled cache behind.
        _PREPROCESS_CACHE.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=frozenset(stopwords.words('english')),
        )
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Clean one raw response string and return the normalised text.

    Steps, in order: lowercase; strip HTML markup with BeautifulSoup; remove
    ``http...`` URLs; replace whitespace-separated words found in
    ``contractions_dict``; drop every character that is not an ASCII letter or
    whitespace; tokenize with NLTK; lemmatize each token; drop English
    stop words; drop tokens of two characters or fewer.

    Args:
        text: The raw response (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The lemmatizer and stop-word set are the same for every row, so they are
    # built once instead of on each call.
    lemmatizer, stop_words = _preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    All rows are handed to the module-level ``tokenizer`` in one batched call
    rather than one ``encode_plus`` call per row followed by ``torch.cat``.
    Every sequence gets the special tokens added and is padded or truncated
    to exactly ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        ``(input_ids, attention_masks)``: two PyTorch tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Seed PyTorch before anything random is created, so the shuffling order, the
# dropout masks and the newly initialised classifier head start from the same
# random state on every run. Without this, differences between experiments
# cannot be separated from seed noise.
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output unit per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps the no-decay default the transformers implementation used.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial learning rate to 0 over all optimisation steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader. Metrics are printed per batch and per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch_idx, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far in this epoch.
        # Dividing by the number of batches (not total_samples / batch_size)
        # stays correct when the last batch is smaller than batch_size.
        avg_train_loss = total_loss / batch_idx
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 reports 0.0 for classes
    # that were never predicted instead of emitting a warning for each one.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test set and clean it with the same pipeline as the training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Predict a class id for every test row, batch by batch, with gradients disabled
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        test_predictions.extend(np.argmax(logits.detach().numpy(), axis=1))

# Translate the predicted ids back to label names for the exported file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Score the predictions against the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test rows together with their predicted labels to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted31.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.652092456817627, Average Training Loss: 1.652092456817627, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6552802324295044, Average Training Loss: 1.6536863446235657, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5935444831848145, Average Training Loss: 1.6336390574773152, Training Accuracy: 0.13541666666666666
Epoch 1/5, Batch Loss: 1.6179311275482178, Average Training Loss: 1.629712074995041, Training Accuracy: 0.1640625
Epoch 1/5, Batch Loss: 1.523504376411438, Average Training Loss: 1.6084705352783204, Training Accuracy: 0.23125
Epoch 1/5, Batch Loss: 1.643082618713379, Average Training Loss: 1.61423921585083, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.5851445198059082, Average Training Loss: 1.6100828307015556, Training Accuracy: 0.21428571428571427
Epoch 1/5, Batch Loss: 1.615753173828125, Average Training Loss: 1.6107916235923767, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6235066652297974, Average Training Loss: 1.

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6481203007518797
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.56      0.52      0.54       132
                Educational Opportunity       0.52      0.50      0.51       138
                         Family Support       0.96      0.97      0.96       133
                      Financial Support       0.64      0.72      0.67       130
                 Program Implementation       0.56      0.54      0.55       132

                               accuracy                           0.65       665
                              macro avg       0.65      0.65      0.65       665
                           weighted avg       0.64      0.65      0.65       665

Test Confusion Matrix:
[[ 69  22   3  15  23]
 [ 25  69   2  21  21]
 [  2   0 129   2   0]
 [ 11  13   1  93  12]
 [ 17  29   0  15  71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# resource when nltk.word_tokenize() is called; fetch it too so the cell also
# runs against a fresh, unpinned `pip install nltk`.
nltk.download('punkt_tab')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-filled cache for the objects preprocess_text() needs on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Both values are built before the cache is touched, so a failed
        # stop-word lookup cannot leave a half-filled cache behind.
        _PREPROCESS_CACHE.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=frozenset(stopwords.words('english')),
        )
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Clean one raw response string and return the normalised text.

    Steps, in order: lowercase; strip HTML markup with BeautifulSoup; remove
    ``http...`` URLs; replace whitespace-separated words found in
    ``contractions_dict``; drop every character that is not an ASCII letter or
    whitespace; tokenize with NLTK; lemmatize each token; drop English
    stop words; drop tokens of two characters or fewer.

    Args:
        text: The raw response (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The lemmatizer and stop-word set are the same for every row, so they are
    # built once instead of on each call.
    lemmatizer, stop_words = _preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    All rows are handed to the module-level ``tokenizer`` in one batched call
    rather than one ``encode_plus`` call per row followed by ``torch.cat``.
    Every sequence gets the special tokens added and is padded or truncated
    to exactly ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        ``(input_ids, attention_masks)``: two PyTorch tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Seed PyTorch before anything random is created, so the shuffling order, the
# dropout masks and the newly initialised classifier head start from the same
# random state on every run. Without this, differences between experiments
# cannot be separated from seed noise.
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output unit per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps the no-decay default the transformers implementation used.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial learning rate to 0 over all optimisation steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader. Metrics are printed per batch and per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch_idx, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far in this epoch.
        # Dividing by the number of batches (not total_samples / batch_size)
        # stays correct when the last batch is smaller than batch_size.
        avg_train_loss = total_loss / batch_idx
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 reports 0.0 for classes
    # that were never predicted instead of emitting a warning for each one.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test set and clean it with the same pipeline as the training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Predict a class id for every test row, batch by batch, with gradients disabled
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        test_predictions.extend(np.argmax(logits.detach().numpy(), axis=1))

# Translate the predicted ids back to label names for the exported file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Score the predictions against the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test rows together with their predicted labels to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted32.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6742693185806274, Average Training Loss: 1.6742693185806274, Training Accuracy: 0.09375
Epoch 1/5, Batch Loss: 1.617430567741394, Average Training Loss: 1.6458499431610107, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6351368427276611, Average Training Loss: 1.6422789096832275, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.606621503829956, Average Training Loss: 1.6333645582199097, Training Accuracy: 0.1953125
Epoch 1/5, Batch Loss: 1.6258810758590698, Average Training Loss: 1.6318678617477418, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6566317081451416, Average Training Loss: 1.6359951694806416, Training Accuracy: 0.17708333333333334
Epoch 1/5, Batch Loss: 1.6096996068954468, Average Training Loss: 1.6322386605398995, Training Accuracy: 0.19196428571428573
Epoch 1/5, Batch Loss: 1.606000542640686, Average Training Loss: 1.6289588958024979, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6206258535385132, Average Training Los

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6616541353383458
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.65      0.60      0.62       132
                Educational Opportunity       0.45      0.55      0.50       138
                         Family Support       0.96      0.95      0.95       133
                      Financial Support       0.67      0.70      0.69       130
                 Program Implementation       0.62      0.52      0.56       132

                               accuracy                           0.66       665
                              macro avg       0.67      0.66      0.66       665
                           weighted avg       0.67      0.66      0.66       665

Test Confusion Matrix:
[[ 79  32   1   7  13]
 [ 20  76   2  22  18]
 [  4   2 126   1   0]
 [ 10  17   2  91  10]
 [  9  41   0  14  68]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# resource when nltk.word_tokenize() is called; fetch it too so the cell also
# runs against a fresh, unpinned `pip install nltk`.
nltk.download('punkt_tab')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lazily-filled cache for the objects preprocess_text() needs on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Both values are built before the cache is touched, so a failed
        # stop-word lookup cannot leave a half-filled cache behind.
        _PREPROCESS_CACHE.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=frozenset(stopwords.words('english')),
        )
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Clean one raw response string and return the normalised text.

    Steps, in order: lowercase; strip HTML markup with BeautifulSoup; remove
    ``http...`` URLs; replace whitespace-separated words found in
    ``contractions_dict``; drop every character that is not an ASCII letter or
    whitespace; tokenize with NLTK; lemmatize each token; drop English
    stop words; drop tokens of two characters or fewer.

    Args:
        text: The raw response (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The lemmatizer and stop-word set are the same for every row, so they are
    # built once instead of on each call.
    lemmatizer, stop_words = _preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact, whitespace-delimited matches only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the pretrained ALBERT checkpoint used below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    All rows are handed to the module-level ``tokenizer`` in one batched call
    rather than one ``encode_plus`` call per row followed by ``torch.cat``.
    Every sequence gets the special tokens added and is padded or truncated
    to exactly ``max_length`` tokens.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        ``(input_ids, attention_masks)``: two PyTorch tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    encoded = tokenizer(
        df['Processed_Response'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Seed PyTorch before anything random is created, so the shuffling order, the
# dropout masks and the newly initialised classifier head start from the same
# random state on every run. Without this, differences between experiments
# cannot be separated from seed noise.
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model (one output unit per encoded class)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps the no-decay default the transformers implementation used.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial learning rate to 0 over all optimisation steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader. Metrics are printed per batch and per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch_idx, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far in this epoch.
        # Dividing by the number of batches (not total_samples / batch_size)
        # stays correct when the last batch is smaller than batch_size.
        avg_train_loss = total_loss / batch_idx
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 reports 0.0 for classes
    # that were never predicted instead of emitting a warning for each one.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the held-out test set and clean it with the same pipeline as the training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Predict a class id for every test row, batch by batch, with gradients disabled
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        test_predictions.extend(np.argmax(logits.detach().numpy(), axis=1))

# Translate the predicted ids back to label names for the exported file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Score the predictions against the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test rows together with their predicted labels to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted33.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6710749864578247, Average Training Loss: 1.6710749864578247, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.7342960834503174, Average Training Loss: 1.702685534954071, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6587553024291992, Average Training Loss: 1.688042124112447, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.739537000656128, Average Training Loss: 1.7009158432483673, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6689410209655762, Average Training Loss: 1.6945208787918091, Training Accuracy: 0.175
Epoch 1/5, Batch Loss: 1.7122232913970947, Average Training Loss: 1.69747128089269, Training Accuracy: 0.17708333333333334
Epoch 1/5, Batch Loss: 1.5389128923416138, Average Training Loss: 1.6748200825282507, Training Accuracy: 0.19642857142857142
Epoch 1/5, Batch Loss: 1.6347146034240723, Average Training Loss: 1.6698068976402283, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.4871492385864258, Average Training Loss: 1.64

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5, Validation Loss: 55.67829167842865, Validation Accuracy: 0.19548872180451127
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       1.00      0.02      0.04       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.19      1.00      0.32       102

                               accuracy                           0.20       532
                              macro avg       0.24      0.20      0.07       532
                           weighted avg       0.24      0.20      0.07       532

Validation Confusion Matrix:
[[  0   0   0   0 112]
 [  0   0   0   0 102]
 [  0   0   2   0 108]
 [  0   0   0   0 106]
 [  0   0   0   0 102]]
E

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.68      0.69       132
                Educational Opportunity       0.54      0.60      0.57       138
                         Family Support       0.94      0.95      0.95       133
                      Financial Support       0.72      0.79      0.75       130
                 Program Implementation       0.73      0.56      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 90  18   2   9  13]
 [ 27  83   3  14  11]
 [  1   3 127   2   0]
 [  5  15   3 103   4]
 [  7  35   0  16  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data packages that the preprocessing step relies on
# (stop-word list, WordNet, and the Punkt tokenizer models).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the mounted Drive path.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use.

    Neither object depends on the text being cleaned, so the pair is cached on
    this function instead of being rebuilt for every row.
    """
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty cells arrive as None/NaN rather than str. NaN is the only float
    # that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is an exact match on each
    # whitespace-separated word, so a contraction with punctuation attached
    # (e.g. "don't,") is left as-is and only loses its punctuation below.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer, stop_words = _preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words (two characters or fewer)
    kept = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(kept)

# Clean every raw response and keep the result in its own column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column into padded model inputs.

    Uses the module-level ``tokenizer``. Every text is truncated or padded to
    ``max_length`` tokens, special tokens included.

    Args:
        df: Frame holding a 'Processed_Response' column of strings.
        max_length: Fixed sequence length applied to every text.

    Returns:
        ``(input_ids, attention_masks)``: the per-text tensors concatenated
        along dimension 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one batch tensor each.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# Encode both splits into fixed-length id/mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Mini-batch size shared by every DataLoader in this cell.
batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches keep the original row order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# Pretrained ALBERT with a classification head sized to the number of label classes.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warm-up).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: one optimisation pass over the training set per epoch,
# followed by a full evaluation on the validation set.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the final batch can be smaller than
    # batch_size, so total_samples / batch_size is not a reliable batch count.
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print the running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        # Store plain Python ints, matching how the training pass collects them.
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the
    # average training loss (val_loss itself is the sum over all batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Held-out test set: apply the same cleaning and tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

test_inputs, test_masks = tokenize_text(test_df)

# Inference only: evaluation mode, gradients disabled for the whole pass.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().numpy().argmax(axis=1)
        test_predictions.extend(preds)

# Store the predictions as class names next to the inputs.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the ground-truth names with the same encoder so both sides are integer ids.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1, then the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
print('Test Classification Report:')
print(test_classification_report)

test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now including the predicted labels, to a new CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted34.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.8684571981430054, Average Training Loss: 1.8684571981430054, Training Accuracy: 0.0625
Epoch 1/5, Batch Loss: 1.4552329778671265, Average Training Loss: 1.661845088005066, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.914751648902893, Average Training Loss: 1.7461472749710083, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5891960859298706, Average Training Loss: 1.7069094777107239, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.666061282157898, Average Training Loss: 1.6987398386001586, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.8254032135009766, Average Training Loss: 1.7198504010836284, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.7441178560256958, Average Training Loss: 1.7233171803610665, Training Accuracy: 0.15178571428571427
Epoch 1/5, Batch Loss: 1.482595682144165, Average Training Loss: 1.6932269930839539, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.7429105043411255, Average Training Loss: 1.69874738322363

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7368421052631579
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.81      0.74       132
                Educational Opportunity       0.57      0.57      0.57       138
                         Family Support       0.94      0.95      0.95       133
                      Financial Support       0.76      0.79      0.77       130
                 Program Implementation       0.74      0.56      0.64       132

                               accuracy                           0.74       665
                              macro avg       0.74      0.74      0.74       665
                           weighted avg       0.74      0.74      0.73       665

Test Confusion Matrix:
[[107  12   3   4   6]
 [ 28  79   4  15  12]
 [  1   3 127   2   0]
 [  7  11   1 103   8]
 [ 13  33   0  12  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data packages that the preprocessing step relies on
# (stop-word list, WordNet, and the Punkt tokenizer models).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the mounted Drive path.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use.

    Neither object depends on the text being cleaned, so the pair is cached on
    this function instead of being rebuilt for every row.
    """
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty cells arrive as None/NaN rather than str. NaN is the only float
    # that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is an exact match on each
    # whitespace-separated word, so a contraction with punctuation attached
    # (e.g. "don't,") is left as-is and only loses its punctuation below.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer, stop_words = _preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words (two characters or fewer)
    kept = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(kept)

# Clean every raw response and keep the result in its own column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column into padded model inputs.

    Uses the module-level ``tokenizer``. Every text is truncated or padded to
    ``max_length`` tokens, special tokens included.

    Args:
        df: Frame holding a 'Processed_Response' column of strings.
        max_length: Fixed sequence length applied to every text.

    Returns:
        ``(input_ids, attention_masks)``: the per-text tensors concatenated
        along dimension 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one batch tensor each.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# Encode both splits into fixed-length id/mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Mini-batch size shared by every DataLoader in this cell.
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches keep the original row order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# Pretrained ALBERT with a classification head sized to the number of label classes.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warm-up).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: one optimisation pass over the training set per epoch,
# followed by a full evaluation on the validation set.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the final batch can be smaller than
    # batch_size, so total_samples / batch_size is not a reliable batch count.
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print the running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        # Store plain Python ints, matching how the training pass collects them.
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the
    # average training loss (val_loss itself is the sum over all batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Held-out test set: apply the same cleaning and tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

test_inputs, test_masks = tokenize_text(test_df)

# Inference only: evaluation mode, gradients disabled for the whole pass.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().numpy().argmax(axis=1)
        test_predictions.extend(preds)

# Store the predictions as class names next to the inputs.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the ground-truth names with the same encoder so both sides are integer ids.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1, then the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
print('Test Classification Report:')
print(test_classification_report)

test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now including the predicted labels, to a new CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted35.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.7082277536392212, Average Training Loss: 1.7082277536392212, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.592028260231018, Average Training Loss: 1.6501280069351196, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.6461527347564697, Average Training Loss: 1.648802916208903, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.6384172439575195, Average Training Loss: 1.6462064981460571, Training Accuracy: 0.1953125
Epoch 1/5, Batch Loss: 1.627968192100525, Average Training Loss: 1.6425588369369506, Training Accuracy: 0.2
Epoch 1/5, Batch Loss: 1.6257233619689941, Average Training Loss: 1.6397529244422913, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6328423023223877, Average Training Loss: 1.6387656927108765, Training Accuracy: 0.20089285714285715
Epoch 1/5, Batch Loss: 1.6334081888198853, Average Training Loss: 1.6380960047245026, Training Accuracy: 0.20703125
Epoch 1/5, Batch Loss: 1.5882896184921265, Average Training Loss: 1.63256

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7218045112781954
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.71      0.71       132
                Educational Opportunity       0.52      0.59      0.55       138
                         Family Support       0.97      0.96      0.97       133
                      Financial Support       0.70      0.77      0.73       130
                 Program Implementation       0.77      0.58      0.66       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 94  21   2   6   9]
 [ 24  82   1  21  10]
 [  1   3 128   1   0]
 [  6  19   1 100   4]
 [  7  34   0  15  76]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data packages that the preprocessing step relies on
# (stop-word list, WordNet, and the Punkt tokenizer models).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the mounted Drive path.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use.

    Neither object depends on the text being cleaned, so the pair is cached on
    this function instead of being rebuilt for every row.
    """
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        cached = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    remove English stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty response; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty cells arrive as None/NaN rather than str. NaN is the only float
    # that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is an exact match on each
    # whitespace-separated word, so a contraction with punctuation attached
    # (e.g. "don't,") is left as-is and only loses its punctuation below.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer, stop_words = _preprocess_resources()
    tokens = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

    # Remove stopwords and short words (two characters or fewer)
    kept = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(kept)

# Clean every raw response and keep the result in its own column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Map each class name in 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the 'albert-base-v2' checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column into padded model inputs.

    Uses the module-level ``tokenizer``. Every text is truncated or padded to
    ``max_length`` tokens, special tokens included.

    Args:
        df: Frame holding a 'Processed_Response' column of strings.
        max_length: Fixed sequence length applied to every text.

    Returns:
        ``(input_ids, attention_masks)``: the per-text tensors concatenated
        along dimension 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one batch tensor each.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# Encode both splits into fixed-length id/mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encoded inputs.
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Mini-batch size shared by every DataLoader in this cell.
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches keep the original row order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# Pretrained ALBERT with a classification head sized to the number of label classes.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate decay over all training steps (no warm-up).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: one optimisation pass over the training set per epoch,
# followed by a full evaluation on the validation set.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the final batch can be smaller than
    # batch_size, so total_samples / batch_size is not a reliable batch count.
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print the running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        # Store plain Python ints, matching how the training pass collects them.
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the
    # average training loss (val_loss itself is the sum over all batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Held-out test set: apply the same cleaning and tokenization as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

test_inputs, test_masks = tokenize_text(test_df)

# Inference only: evaluation mode, gradients disabled for the whole pass.
model.eval()
test_predictions = []

with torch.no_grad():
    for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().numpy().argmax(axis=1)
        test_predictions.extend(preds)

# Store the predictions as class names next to the inputs.
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Encode the ground-truth names with the same encoder so both sides are integer ids.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1, then the confusion matrix.
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    target_names=label_encoder.classes_,
)
print('Test Classification Report:')
print(test_classification_report)

test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now including the predicted labels, to a new CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted36.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6381934881210327, Average Training Loss: 1.6381934881210327, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.5734894275665283, Average Training Loss: 1.6058414578437805, Training Accuracy: 0.296875
Epoch 1/5, Batch Loss: 1.5555050373077393, Average Training Loss: 1.5890626509984334, Training Accuracy: 0.28125
Epoch 1/5, Batch Loss: 1.636052131652832, Average Training Loss: 1.600810021162033, Training Accuracy: 0.2421875
Epoch 1/5, Batch Loss: 1.5943045616149902, Average Training Loss: 1.5995089292526246, Training Accuracy: 0.23125
Epoch 1/5, Batch Loss: 1.609272837638855, Average Training Loss: 1.6011362473169963, Training Accuracy: 0.23958333333333334
Epoch 1/5, Batch Loss: 1.6609894037246704, Average Training Loss: 1.6096866982323783, Training Accuracy: 0.24553571428571427
Epoch 1/5, Batch Loss: 1.5101349353790283, Average Training Loss: 1.5972427278757095, Training Accuracy: 0.265625
Epoch 1/5, Batch Loss: 1.5129156112670898, Average Training Loss: 1.587873

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7112781954887218
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.69      0.70       132
                Educational Opportunity       0.53      0.57      0.55       138
                         Family Support       0.96      0.97      0.97       133
                      Financial Support       0.70      0.79      0.74       130
                 Program Implementation       0.67      0.55      0.60       132

                               accuracy                           0.71       665
                              macro avg       0.71      0.71      0.71       665
                           weighted avg       0.71      0.71      0.71       665

Test Confusion Matrix:
[[ 91  15   2   9  15]
 [ 23  78   3  19  15]
 [  0   1 129   3   0]
 [  5  16   0 103   6]
 [  8  38   0  14  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources used by the preprocessing step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the CSV at file_path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Pipeline, in order: lowercase; strip HTML markup; delete ``http...`` URLs;
    expand whitespace-delimited contractions found in ``contractions_dict``;
    delete every character that is not an ASCII letter or whitespace;
    tokenize; drop stopwords; lemmatize; drop stopwords again; drop tokens of
    two characters or fewer.

    Args:
        text: Raw response text. Anything that is not a ``str`` (e.g. the
            float ``NaN`` pandas yields for an empty CSV cell) is treated as
            an empty response.

    Returns:
        str: Remaining tokens joined by single spaces (possibly ``''``).
    """
    # Missing cells have no .lower(); map them to an empty document instead of
    # aborting the whole DataFrame.apply() with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only exact whitespace-delimited matches are expanded)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) WordNet
    # lemmatizer can rewrite stopwords into forms that are no longer on the
    # stopword list (e.g. "does" -> "doe"), which would then leak through.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]

    # Lemmatize what is left
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Filter once more so a lemma that is itself a stopword is still removed,
    # and drop short words (two characters or fewer) in the same pass.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Map the class names in 'Label' to integer ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Encoded_Label'] = encoded_labels

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column with the module-level tokenizer.

    Every response is encoded on its own with special tokens added, then
    padded or truncated to ``max_length``; the per-response tensors are
    concatenated along dimension 0.

    Args:
        df: DataFrame providing a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, in the same order as
        the rows of ``df``.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-response tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Encode both splits into padded id / attention-mask tensors
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the inputs above
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Mini-batch size shared by the training and validation loaders
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the original row order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# ALBERT with a classification head sized to the number of distinct labels
num_classes = len(label_encoder.classes_)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=num_classes)

# Optimizer, plus a linear schedule with no warmup that spans one step per
# training batch per epoch
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=0)

# Training loop: each epoch runs one optimisation pass over train_dataloader,
# reports training metrics, then evaluates on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch_index, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the batch losses seen so far. Divide by the number
        # of batches rather than total_samples / batch_size, which is a
        # fractional (too small) count once a short final batch is processed.
        avg_train_loss = total_loss / batch_index
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 keeps the reported
    # scores (0.0 for a class that was never predicted) but suppresses the
    # repeated undefined-metric warnings seen early in training.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    total_val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean batch loss (not the sum) so the figure is comparable to
    # the training loss and independent of the number of validation batches.
    val_loss = total_val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test set and clean it with the same pipeline as the training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Inference only: eval mode, no gradient tracking, batches in row order
model.eval()
test_loader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
test_predictions = []

with torch.no_grad():
    for inputs, masks in test_loader:
        batch_logits = model(inputs, attention_mask=masks).logits
        # Highest-scoring class id for every row of the batch
        test_predictions.extend(np.argmax(batch_logits.detach().numpy(), axis=1))

# Store the predicted class names next to the test rows
predicted_names = label_encoder.inverse_transform(test_predictions)
test_df['Predicted Labels'] = predicted_names

# Encode the reference labels with the already-fitted encoder and score the predictions
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test rows, including the predictions, to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted37.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Confirm where the file was written
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6765186786651611, Average Training Loss: 1.6765186786651611, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.7484495639801025, Average Training Loss: 1.7124841213226318, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.6749070882797241, Average Training Loss: 1.6999584436416626, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.6137641668319702, Average Training Loss: 1.6784098744392395, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.636732578277588, Average Training Loss: 1.6700744152069091, Training Accuracy: 0.15
Epoch 1/5, Batch Loss: 1.6466845273971558, Average Training Loss: 1.6661761005719502, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6520148515701294, Average Training Loss: 1.6641530650002616, Training Accuracy: 0.15178571428571427
Epoch 1/5, Batch Loss: 1.6229439973831177, Average Training Loss: 1.6590019315481186, Training Accuracy: 0.1640625
Epoch 1/5, Batch Loss: 1.6282238960266113, Average Training Loss: 1.6555821498235066, Train

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/5, Validation Loss: 51.347862124443054, Validation Accuracy: 0.39097744360902253
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.37      0.54      0.44       112
                Educational Opportunity       0.33      0.30      0.32       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.41      0.36      0.38       106
                 Program Implementation       0.43      0.76      0.55       102

                               accuracy                           0.39       532
                              macro avg       0.31      0.39      0.34       532
                           weighted avg       0.31      0.39      0.34       532

Validation Confusion Matrix:
[[61 18  0  6 27]
 [36 31  0  9 26]
 [34 13  0 38 25]
 [24 20  0 38 24]
 [11 11  0  2 78]]
Epoch 2/5, Batch Loss: 1.

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.724812030075188
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.75      0.70       132
                Educational Opportunity       0.56      0.54      0.55       138
                         Family Support       0.96      0.97      0.96       133
                      Financial Support       0.71      0.81      0.76       130
                 Program Implementation       0.74      0.56      0.64       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.73      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 99  16   2   8   7]
 [ 28  75   3  19  13]
 [  0   2 129   2   0]
 [  9   9   1 105   6]
 [ 13  31   0  14  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources used by the preprocessing step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the CSV at file_path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Pipeline, in order: lowercase; strip HTML markup; delete ``http...`` URLs;
    expand whitespace-delimited contractions found in ``contractions_dict``;
    delete every character that is not an ASCII letter or whitespace;
    tokenize; drop stopwords; lemmatize; drop stopwords again; drop tokens of
    two characters or fewer.

    Args:
        text: Raw response text. Anything that is not a ``str`` (e.g. the
            float ``NaN`` pandas yields for an empty CSV cell) is treated as
            an empty response.

    Returns:
        str: Remaining tokens joined by single spaces (possibly ``''``).
    """
    # Missing cells have no .lower(); map them to an empty document instead of
    # aborting the whole DataFrame.apply() with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only exact whitespace-delimited matches are expanded)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) WordNet
    # lemmatizer can rewrite stopwords into forms that are no longer on the
    # stopword list (e.g. "does" -> "doe"), which would then leak through.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]

    # Lemmatize what is left
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Filter once more so a lemma that is itself a stopword is still removed,
    # and drop short words (two characters or fewer) in the same pass.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Map the class names in 'Label' to integer ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Encoded_Label'] = encoded_labels

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column with the module-level tokenizer.

    Every response is encoded on its own with special tokens added, then
    padded or truncated to ``max_length``; the per-response tensors are
    concatenated along dimension 0.

    Args:
        df: DataFrame providing a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, in the same order as
        the rows of ``df``.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-response tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Encode both splits into padded id / attention-mask tensors
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the inputs above
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Mini-batch size shared by the training and validation loaders
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the original row order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# ALBERT with a classification head sized to the number of distinct labels
num_classes = len(label_encoder.classes_)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=num_classes)

# Optimizer, plus a linear schedule with no warmup that spans one step per
# training batch per epoch
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=0)

# Training loop: each epoch runs one optimisation pass over train_dataloader,
# reports training metrics, then evaluates on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch_index, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the batch losses seen so far. Divide by the number
        # of batches rather than total_samples / batch_size, which is a
        # fractional (too small) count once a short final batch is processed.
        avg_train_loss = total_loss / batch_index
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report. zero_division=0 keeps the reported
    # scores (0.0 for a class that was never predicted) but suppresses the
    # repeated undefined-metric warnings seen early in training.
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_, zero_division=0)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    total_val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean batch loss (not the sum) so the figure is comparable to
    # the training loss and independent of the number of validation batches.
    val_loss = total_val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test set and clean it with the same pipeline as the training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Inference only: eval mode, no gradient tracking, batches in row order
model.eval()
test_loader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
test_predictions = []

with torch.no_grad():
    for inputs, masks in test_loader:
        batch_logits = model(inputs, attention_mask=masks).logits
        # Highest-scoring class id for every row of the batch
        test_predictions.extend(np.argmax(batch_logits.detach().numpy(), axis=1))

# Store the predicted class names next to the test rows
predicted_names = label_encoder.inverse_transform(test_predictions)
test_df['Predicted Labels'] = predicted_names

# Encode the reference labels with the already-fitted encoder and score the predictions
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test rows, including the predictions, to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted38.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Confirm where the file was written
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.5785589218139648, Average Training Loss: 1.5785589218139648, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6751905679702759, Average Training Loss: 1.6268747448921204, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.7039055824279785, Average Training Loss: 1.6525516907374065, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.7222473621368408, Average Training Loss: 1.669975608587265, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.5907821655273438, Average Training Loss: 1.6541369199752807, Training Accuracy: 0.175
Epoch 1/5, Batch Loss: 1.5724143981933594, Average Training Loss: 1.640516499678294, Training Accuracy: 0.19791666666666666
Epoch 1/5, Batch Loss: 1.674635648727417, Average Training Loss: 1.6453906638281686, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.575003743171692, Average Training Loss: 1.636592298746109, Training Accuracy: 0.2265625
Epoch 1/5, Batch Loss: 1.6285078525543213, Average Training Loss: 1.6356940269470215, Traini

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/5, Validation Loss: 54.40745198726654, Validation Accuracy: 0.2349624060150376
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.11      0.04      0.05       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.22      0.54      0.31       110
                      Financial Support       0.23      0.44      0.30       106
                 Program Implementation       0.94      0.15      0.25       102

                               accuracy                           0.23       532
                              macro avg       0.30      0.23      0.18       532
                           weighted avg       0.29      0.23      0.18       532

Validation Confusion Matrix:
[[ 4  0 60 48  0]
 [ 6  0 59 37  0]
 [ 4  0 59 47  0]
 [ 3  0 55 47  1]
 [20  0 40 27 15]]
Epoch 2/5, Batch Loss: 1.59

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5, Validation Loss: 52.48396909236908, Validation Accuracy: 0.31954887218045114
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.39      0.20      0.26       112
                Educational Opportunity       0.20      0.01      0.02       102
                         Family Support       0.24      0.85      0.38       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.64      0.52      0.57       102

                               accuracy                           0.32       532
                              macro avg       0.29      0.32      0.25       532
                           weighted avg       0.29      0.32      0.25       532

Validation Confusion Matrix:
[[22  2 83  0  5]
 [16  1 71  0 14]
 [ 9  2 94  0  5]
 [ 7  0 93  0  6]
 [ 2  0 47  0 53]]
Epoch 3/5, Batch Loss: 1.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/5, Validation Loss: 37.06374579668045, Validation Accuracy: 0.5639097744360902
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.38      0.88      0.53       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.83      0.98      0.90       110
                      Financial Support       0.83      0.28      0.42       106
                 Program Implementation       0.58      0.63      0.60       102

                               accuracy                           0.56       532
                              macro avg       0.52      0.55      0.49       532
                           weighted avg       0.53      0.56      0.50       532

Validation Confusion Matrix:
[[ 98   0   5   2   7]
 [ 74   0  11   2  15]
 [  2   0 108   0   0]
 [ 46   0   5  30  25]
 [ 35   0   1   2  64]]
Ep

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6330827067669172
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.51      0.58      0.55       132
                Educational Opportunity       0.34      0.36      0.35       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.68      0.80      0.74       130
                 Program Implementation       0.77      0.45      0.57       132

                               accuracy                           0.63       665
                              macro avg       0.65      0.64      0.63       665
                           weighted avg       0.65      0.63      0.63       665

Test Confusion Matrix:
[[ 77  40   2   8   5]
 [ 53  50   5  21   9]
 [  2   0 130   1   0]
 [ 10  11   1 104   4]
 [  8  46   0  18  60]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources used by the preprocessing step below
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the CSV at file_path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Pipeline, in order: lowercase; strip HTML markup; delete ``http...`` URLs;
    expand whitespace-delimited contractions found in ``contractions_dict``;
    delete every character that is not an ASCII letter or whitespace;
    tokenize; drop stopwords; lemmatize; drop stopwords again; drop tokens of
    two characters or fewer.

    Args:
        text: Raw response text. Anything that is not a ``str`` (e.g. the
            float ``NaN`` pandas yields for an empty CSV cell) is treated as
            an empty response.

    Returns:
        str: Remaining tokens joined by single spaces (possibly ``''``).
    """
    # Missing cells have no .lower(); map them to an empty document instead of
    # aborting the whole DataFrame.apply() with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only exact whitespace-delimited matches are expanded)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) WordNet
    # lemmatizer can rewrite stopwords into forms that are no longer on the
    # stopword list (e.g. "does" -> "doe"), which would then leak through.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]

    # Lemmatize what is left
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Filter once more so a lemma that is itself a stopword is still removed,
    # and drop short words (two characters or fewer) in the same pass.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Map the class names in 'Label' to integer ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Encoded_Label'] = encoded_labels

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``,
    with special tokens added and the result padded or truncated to
    ``max_length``.

    Args:
        df: Object exposing a ``'Processed_Response'`` column of texts.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)``, the per-text tensors
        concatenated along dimension 0.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    encoded_rows = [
        tokenizer.encode_plus(response, **encoding_options)
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per output
    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)

    return input_ids, attention_masks

# Hyper-parameters of this run
batch_size = 32
epochs = 5

# Encode the processed responses of both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Encoded class ids as tensors, in the same row order as the inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Training batches are sampled randomly
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

# ALBERT with a classification head sized to the number of encoded classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Class ids in encoder order. Passing them explicitly keeps every report and
# confusion matrix at a fixed size and avoids a target_names length mismatch
# when a class is absent from a split.
train_class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training metrics reuse the forward pass that drives the update
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays exact when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for a class
    # that was never predicted instead of emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=train_class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=train_class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The full list of class ids is passed explicitly: without it the report
# fails with a target_names length mismatch when a class is missing from
# the test data, and the confusion matrix would shrink.
test_class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=test_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=test_class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted39.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.7754950523376465, Average Training Loss: 1.7754950523376465, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.714264154434204, Average Training Loss: 1.7448796033859253, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.7050178050994873, Average Training Loss: 1.731592337290446, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.7199972867965698, Average Training Loss: 1.728693574666977, Training Accuracy: 0.1796875
Epoch 1/5, Batch Loss: 1.6697684526443481, Average Training Loss: 1.7169085502624513, Training Accuracy: 0.1625
Epoch 1/5, Batch Loss: 1.5828262567520142, Average Training Loss: 1.694561501344045, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.7311385869979858, Average Training Loss: 1.699786799294608, Training Accuracy: 0.16517857142857142
Epoch 1/5, Batch Loss: 1.6162978410720825, Average Training Loss: 1.6893506795167923, Training Accuracy: 0.17578125
Epoch 1/5, Batch Loss: 1.62482750415802, Average Training Loss: 1.682181

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7157894736842105
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.73      0.69       132
                Educational Opportunity       0.56      0.62      0.59       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.74      0.74      0.74       130
                 Program Implementation       0.69      0.53      0.60       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.71       665
                           weighted avg       0.72      0.72      0.71       665

Test Confusion Matrix:
[[ 96  14   2   6  14]
 [ 27  85   4  13   9]
 [  0   1 129   3   0]
 [  9  15   1  96   9]
 [ 14  36   0  12  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources used by preprocess_text:
# stop-word list, WordNet (lemmatizer) and punkt (word tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses used for training and validation
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lemmatizer and stop-word set are created on first use and then reused, so
# they are not rebuilt for every row passed through preprocess_text.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, drop English stop words and drop tokens of length <= 2.

    Args:
        text: The raw response. ``None`` and missing scalars (e.g. the NaN
            pandas yields for an empty CSV cell) are treated as an empty
            response; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text; an empty string when nothing survives.
    """
    # Non-string cells used to crash on .lower(); map missing values to ''.
    if not isinstance(text, str):
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (matched per whitespace-separated word)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']
    stop_words = _PREPROCESS_RESOURCES['stop_words']

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Store the cleaned version of every response next to the raw text
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Integer-encode the class names; the fitted encoder is reused later on
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer belonging to the albert-base-v2 checkpoint
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``,
    with special tokens added and the result padded or truncated to
    ``max_length``.

    Args:
        df: Object exposing a ``'Processed_Response'`` column of texts.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)``, the per-text tensors
        concatenated along dimension 0.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    encoded_rows = [
        tokenizer.encode_plus(response, **encoding_options)
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per output
    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)

    return input_ids, attention_masks

# Hyper-parameters of this run
batch_size = 32
epochs = 5

# Encode the processed responses of both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Encoded class ids as tensors, in the same row order as the inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Training batches are sampled randomly
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

# ALBERT with a classification head sized to the number of encoded classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Class ids in encoder order. Passing them explicitly keeps every report and
# confusion matrix at a fixed size and avoids a target_names length mismatch
# when a class is absent from a split.
train_class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training metrics reuse the forward pass that drives the update
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays exact when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for a class
    # that was never predicted instead of emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=train_class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=train_class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The full list of class ids is passed explicitly: without it the report
# fails with a target_names length mismatch when a class is missing from
# the test data, and the confusion matrix would shrink.
test_class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=test_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=test_class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted40.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6138460636138916, Average Training Loss: 1.6138460636138916, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6412383317947388, Average Training Loss: 1.6275421977043152, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6547605991363525, Average Training Loss: 1.636614998181661, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6197344064712524, Average Training Loss: 1.6323948502540588, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6093746423721313, Average Training Loss: 1.6277908086776733, Training Accuracy: 0.2
Epoch 1/5, Batch Loss: 1.6055514812469482, Average Training Loss: 1.6240842541058857, Training Accuracy: 0.19791666666666666
Epoch 1/5, Batch Loss: 1.5642956495285034, Average Training Loss: 1.6155430248805456, Training Accuracy: 0.21428571428571427
Epoch 1/5, Batch Loss: 1.6364365816116333, Average Training Loss: 1.6181547194719315, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6189312934875488, Average Training Loss: 1.6182410054

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/5, Validation Loss: 26.26671063899994, Validation Accuracy: 0.3966165413533835
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       1.00      0.05      0.10       112
                Educational Opportunity       0.33      0.63      0.44       102
                         Family Support       0.40      0.69      0.50       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.46      0.64      0.53       102

                               accuracy                           0.40       532
                              macro avg       0.44      0.40      0.31       532
                           weighted avg       0.44      0.40      0.31       532

Validation Confusion Matrix:
[[ 6 65 25  0 16]
 [ 0 64 21  0 17]
 [ 0 15 76  0 19]
 [ 0 27 54  0 25]
 [ 0 21 16  0 65]]
Epoch 2/5, Batch Loss: 1.47

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7293233082706767
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.67      0.68       132
                Educational Opportunity       0.58      0.62      0.60       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.71      0.83      0.76       130
                 Program Implementation       0.74      0.55      0.63       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.73       665
                           weighted avg       0.73      0.73      0.73       665

Test Confusion Matrix:
[[ 89  19   2  10  12]
 [ 20  86   4  16  12]
 [  0   2 129   2   0]
 [  7  11   2 108   2]
 [ 12  30   0  17  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources used by preprocess_text:
# stop-word list, WordNet (lemmatizer) and punkt (word tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses used for training and validation
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Lemmatizer and stop-word set are created on first use and then reused, so
# they are not rebuilt for every row passed through preprocess_text.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, drop English stop words and drop tokens of length <= 2.

    Args:
        text: The raw response. ``None`` and missing scalars (e.g. the NaN
            pandas yields for an empty CSV cell) are treated as an empty
            response; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text; an empty string when nothing survives.
    """
    # Non-string cells used to crash on .lower(); map missing values to ''.
    if not isinstance(text, str):
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (matched per whitespace-separated word)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']
    stop_words = _PREPROCESS_RESOURCES['stop_words']

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Store the cleaned version of every response next to the raw text
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Integer-encode the class names; the fitted encoder is reused later on
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer belonging to the albert-base-v2 checkpoint
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length model inputs.

    Each text is encoded on its own with the module-level ``tokenizer``,
    with special tokens added and the result padded or truncated to
    ``max_length``.

    Args:
        df: Object exposing a ``'Processed_Response'`` column of texts.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)``, the per-text tensors
        concatenated along dimension 0.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    encoded_rows = [
        tokenizer.encode_plus(response, **encoding_options)
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per output
    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)

    return input_ids, attention_masks

# Hyper-parameters of this run
batch_size = 16
epochs = 8

# Encode the processed responses of both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Encoded class ids as tensors, in the same row order as the inputs
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Training batches are sampled randomly
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

# ALBERT with a classification head sized to the number of encoded classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Class ids in encoder order. Passing them explicitly keeps every report and
# confusion matrix at a fixed size and avoids a target_names length mismatch
# when a class is absent from a split.
train_class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Training metrics reuse the forward pass that drives the update
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays exact when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports 0.0 for a class
    # that was never predicted instead of emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=train_class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

    # val_loss is the sum of the per-batch losses; report the per-batch mean
    # so it is comparable with the average training loss printed above.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=train_class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=train_class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# The full list of class ids is passed explicitly: without it the report
# fails with a target_names length mismatch when a class is missing from
# the test data, and the confusion matrix would shrink.
test_class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=test_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=test_class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted41.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6498172283172607, Average Training Loss: 1.6498172283172607, Training Accuracy: 0.375
Epoch 1/8, Batch Loss: 1.6972275972366333, Average Training Loss: 1.673522412776947, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6050212383270264, Average Training Loss: 1.6506886879603069, Training Accuracy: 0.2708333333333333
Epoch 1/8, Batch Loss: 1.7891594171524048, Average Training Loss: 1.6853063702583313, Training Accuracy: 0.265625
Epoch 1/8, Batch Loss: 1.723007321357727, Average Training Loss: 1.6928465604782104, Training Accuracy: 0.2625
Epoch 1/8, Batch Loss: 1.5777535438537598, Average Training Loss: 1.673664391040802, Training Accuracy: 0.2604166666666667
Epoch 1/8, Batch Loss: 1.6944129467010498, Average Training Loss: 1.6766284704208374, Training Accuracy: 0.24107142857142858
Epoch 1/8, Batch Loss: 1.6117371320724487, Average Training Loss: 1.6685170531272888, Training Accuracy: 0.2421875
Epoch 1/8, Batch Loss: 1.6134229898452759, Average Training Loss: 1.

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6932330827067669
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.70      0.68       132
                Educational Opportunity       0.49      0.51      0.50       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.68      0.76      0.72       130
                 Program Implementation       0.68      0.55      0.61       132

                               accuracy                           0.69       665
                              macro avg       0.70      0.69      0.69       665
                           weighted avg       0.69      0.69      0.69       665

Test Confusion Matrix:
[[ 92  21   3   7   9]
 [ 26  70   3  23  16]
 [  2   1 127   3   0]
 [  5  17   0  99   9]
 [ 13  33   0  13  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used during preprocessing: the stop-word list,
# WordNet (lemmatizer) and Punkt (word tokenizer).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training responses
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell, or None) is
            treated as an empty response.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells have no .lower(); return an empty string instead of
    # aborting the whole column-wide apply with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stop-word set once and cache them on the
    # function object, rather than rebuilding both for every row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (each whitespace-delimited word is looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Map each class label to an integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the pretrained 'albert-base-v2' checkpoint
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column of ``df`` with the module-level tokenizer.

    Each text is encoded on its own with special tokens added and with
    truncation and ``'max_length'`` padding enabled; the per-text tensors are
    then concatenated along dimension 0.

    Args:
        df: Object whose ``'Processed_Response'`` entry yields the texts to encode.
        max_length: Length passed to the tokenizer for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as concatenated tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    # One encoding per response, in column order
    encodings = [
        tokenizer.encode_plus(response, **encode_options)
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed torch so the shuffled batch order and the newly initialised classifier
# head are repeatable between runs (same value as the split's random_state).
torch.manual_seed(42)

# Use the GPU when the runtime provides one; fall back to the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
# Move the model before the optimizer is built so the optimizer holds the on-device parameters
model.to(device)

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed with NumPy, so copy predictions and labels back to the CPU
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches processed so far: total_loss is a sum of
        # per-batch losses, and total_samples / batch_size is fractional on a
        # final partial batch.
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training average
    # (val_loss itself is the sum over all validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted42.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.5899052619934082, Average Training Loss: 1.5899052619934082, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6488189697265625, Average Training Loss: 1.6193621158599854, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5985078811645508, Average Training Loss: 1.6124107042948406, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.8670909404754639, Average Training Loss: 1.6760807633399963, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.6811845302581787, Average Training Loss: 1.6771015167236327, Training Accuracy: 0.1625
Epoch 1/8, Batch Loss: 1.621835470199585, Average Training Loss: 1.6678905089696248, Training Accuracy: 0.17708333333333334
Epoch 1/8, Batch Loss: 1.6795367002487183, Average Training Loss: 1.669554250580924, Training Accuracy: 0.16964285714285715
Epoch 1/8, Batch Loss: 1.681722640991211, Average Training Loss: 1.6710752993822098, Training Accuracy: 0.1640625
Epoch 1/8, Batch Loss: 1.6173522472381592, Average Training Loss: 1.66510607136

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.70      0.70       132
                Educational Opportunity       0.53      0.57      0.55       138
                         Family Support       0.95      0.94      0.95       133
                      Financial Support       0.73      0.78      0.76       130
                 Program Implementation       0.68      0.61      0.64       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 92  20   2   7  11]
 [ 23  78   4  16  17]
 [  1   3 125   3   1]
 [  6  14   0 102   8]
 [  8  32   0  12  80]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used during preprocessing: the stop-word list,
# WordNet (lemmatizer) and Punkt (word tokenizer).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training responses
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell, or None) is
            treated as an empty response.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells have no .lower(); return an empty string instead of
    # aborting the whole column-wide apply with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stop-word set once and cache them on the
    # function object, rather than rebuilding both for every row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (each whitespace-delimited word is looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Map each class label to an integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the pretrained 'albert-base-v2' checkpoint
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the module-level tokenizer.

    Each text is encoded on its own with special tokens added and with
    truncation and ``'max_length'`` padding enabled; the per-text tensors are
    then concatenated along dimension 0.

    Args:
        df: Object whose ``'Processed_Response'`` entry yields the texts to encode.
        max_length: Length passed to the tokenizer for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as concatenated tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    # One encoding per response, in column order
    encodings = [
        tokenizer.encode_plus(response, **encode_options)
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed torch so the shuffled batch order and the newly initialised classifier
# head are repeatable between runs (same value as the split's random_state).
torch.manual_seed(42)

# Use the GPU when the runtime provides one; fall back to the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
# Move the model before the optimizer is built so the optimizer holds the on-device parameters
model.to(device)

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed with NumPy, so copy predictions and labels back to the CPU
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches processed so far: total_loss is a sum of
        # per-batch losses, and total_samples / batch_size is fractional on a
        # final partial batch.
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training average
    # (val_loss itself is the sum over all validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted43.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.756250023841858, Average Training Loss: 1.756250023841858, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.6857091188430786, Average Training Loss: 1.7209795713424683, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.687939167022705, Average Training Loss: 1.7099661032358806, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5705070495605469, Average Training Loss: 1.6751013398170471, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.6458609104156494, Average Training Loss: 1.6692532539367675, Training Accuracy: 0.225
Epoch 1/8, Batch Loss: 1.6246293783187866, Average Training Loss: 1.6618159413337708, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6448630094528198, Average Training Loss: 1.6593940939222063, Training Accuracy: 0.20982142857142858
Epoch 1/8, Batch Loss: 1.5384018421173096, Average Training Loss: 1.6442700624465942, Training Accuracy: 0.22265625
Epoch 1/8, Batch Loss: 1.593916893005371, Average Training Loss: 1.6386752658420138, Trai

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6842105263157895
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.61      0.70      0.65       132
                Educational Opportunity       0.52      0.53      0.53       138
                         Family Support       0.96      0.94      0.95       133
                      Financial Support       0.70      0.77      0.73       130
                 Program Implementation       0.64      0.48      0.55       132

                               accuracy                           0.68       665
                              macro avg       0.69      0.69      0.68       665
                           weighted avg       0.69      0.68      0.68       665

Test Confusion Matrix:
[[ 93  16   1   9  13]
 [ 29  73   3  18  15]
 [  1   2 125   3   2]
 [ 10  13   1 100   6]
 [ 19  36   0  13  64]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used during preprocessing: the stop-word list,
# WordNet (lemmatizer) and Punkt (word tokenizer).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training responses
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand whitespace-delimited words found in ``contractions_dict``, remove
    every character that is not an ASCII letter or whitespace, tokenize,
    lemmatize, then discard English stop words and tokens of two characters
    or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell, or None) is
            treated as an empty response.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing cells have no .lower(); return an empty string instead of
    # aborting the whole column-wide apply with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stop-word set once and cache them on the
    # function object, rather than rebuilding both for every row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (each whitespace-delimited word is looked up verbatim)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Map each class label to an integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer for the pretrained 'albert-base-v2' checkpoint
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column of ``df`` with the module-level tokenizer.

    Each text is encoded on its own with special tokens added and with
    truncation and ``'max_length'`` padding enabled; the per-text tensors are
    then concatenated along dimension 0.

    Args:
        df: Object whose ``'Processed_Response'`` entry yields the texts to encode.
        max_length: Length passed to the tokenizer for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as concatenated tensors.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    # One encoding per response, in column order
    encodings = [
        tokenizer.encode_plus(response, **encode_options)
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed torch so the shuffled batch order and the newly initialised classifier
# head are repeatable between runs (same value as the split's random_state).
torch.manual_seed(42)

# Use the GPU when the runtime provides one; fall back to the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))
# Move the model before the optimizer is built so the optimizer holds the on-device parameters
model.to(device)

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Metrics are computed with NumPy, so copy predictions and labels back to the CPU
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches processed so far: total_loss is a sum of
        # per-batch losses, and total_samples / batch_size is fractional on a
        # final partial batch.
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training average
    # (val_loss itself is the sum over all validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted44.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6258224248886108, Average Training Loss: 1.6258224248886108, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5975301265716553, Average Training Loss: 1.611676275730133, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.654746651649475, Average Training Loss: 1.626033067703247, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.637841820716858, Average Training Loss: 1.6289852559566498, Training Accuracy: 0.1953125
Epoch 1/8, Batch Loss: 1.6079092025756836, Average Training Loss: 1.6247700452804565, Training Accuracy: 0.2
Epoch 1/8, Batch Loss: 1.586766242980957, Average Training Loss: 1.61843607823054, Training Accuracy: 0.22395833333333334
Epoch 1/8, Batch Loss: 1.6817913055419922, Average Training Loss: 1.6274868249893188, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6493905782699585, Average Training Loss: 1.6302247941493988, Training Accuracy: 0.21484375
Epoch 1/8, Batch Loss: 1.706445336341858, Average Training Loss: 1.6386937432818942, Traini

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.73      0.69       132
                Educational Opportunity       0.54      0.56      0.55       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.73      0.82      0.77       130
                 Program Implementation       0.74      0.51      0.60       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.71       665
                           weighted avg       0.72      0.72      0.71       665

Test Confusion Matrix:
[[ 96  20   3   6   7]
 [ 29  77   5  15  12]
 [  0   1 130   2   0]
 [  7  11   0 107   5]
 [ 14  34   0  17  67]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by the preprocessing step below
# (stopword list, WordNet for the lemmatizer, punkt for word_tokenize).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the Drive-mounted path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions found in ``contractions_dict``, delete every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    remove English stopwords and remove tokens of one or two characters.

    Args:
        text: Raw response. ``None``/NaN (what pandas yields for an empty CSV
            cell) is treated as an empty document; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The surviving tokens joined by single spaces; may be ``''``.
    """
    # Missing cells arrive as NaN/None and have no .lower(); returning ''
    # keeps a single blank row from aborting the whole Series.apply().
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer and the stopword set are identical for every call, so
    # build them once and cache them on the function object instead of
    # rebuilding them for each DataFrame row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response; the original text stays in 'Responses'
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# ALBERT tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    The whole column is encoded in one batched tokenizer call rather than
    row by row through the deprecated ``encode_plus``; the resulting tensors
    are the same.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens, special tokens included.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, each with one row per
        text and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Run-level hyperparameters
batch_size = 16
epochs = 8

# --- Training split: token ids, attention masks and integer labels ---
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # reshuffled on every epoch
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# --- Validation split: same tensors, visited in a fixed order ---
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# ALBERT with a classification head sized to the number of label classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a learning rate that decays linearly to zero over all
# optimisation steps (no warm-up phase)
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: each epoch runs one optimisation pass over train_dataloader
# and then a no-grad evaluation pass over val_dataloader.

# Pin the label ids so reports and confusion matrices always span every
# class; otherwise classification_report raises when a split happens to miss
# a class and target_names no longer matches the labels it infers.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Divide by the number of
        # batches seen so far: total_samples / batch_size is fractional on a
        # short final batch and would skew the figure.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean batch loss (not the raw sum) so the figure is on the
    # same scale as the training loss and independent of the batch count.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (evaluation mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Pin the label ids so the report and matrix always span every class;
# without this, classification_report raises if a class is absent from both
# the true and predicted test labels and target_names no longer lines up.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted45.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6410489082336426, Average Training Loss: 1.6410489082336426, Training Accuracy: 0.375
Epoch 1/8, Batch Loss: 1.7050055265426636, Average Training Loss: 1.673027217388153, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.9113744497299194, Average Training Loss: 1.7524762948354085, Training Accuracy: 0.20833333333333334
Epoch 1/8, Batch Loss: 1.6931915283203125, Average Training Loss: 1.7376551032066345, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.7283766269683838, Average Training Loss: 1.7357994079589845, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6749953031539917, Average Training Loss: 1.7256653904914856, Training Accuracy: 0.19791666666666666
Epoch 1/8, Batch Loss: 1.6190986633300781, Average Training Loss: 1.7104415723255701, Training Accuracy: 0.17857142857142858
Epoch 1/8, Batch Loss: 1.6436477899551392, Average Training Loss: 1.7020923495292664, Training Accuracy: 0.1796875
Epoch 1/8, Batch Loss: 1.6071157455444336, Average Training Loss: 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.718796992481203
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.74      0.71       132
                Educational Opportunity       0.57      0.53      0.55       138
                         Family Support       0.94      0.95      0.95       133
                      Financial Support       0.73      0.79      0.76       130
                 Program Implementation       0.68      0.58      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 98  15   2   5  12]
 [ 29  73   4  15  17]
 [  0   3 127   3   0]
 [  6  12   2 103   7]
 [ 13  26   0  16  77]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by the preprocessing step below
# (stopword list, WordNet for the lemmatizer, punkt for word_tokenize).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the Drive-mounted path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions found in ``contractions_dict``, delete every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    remove English stopwords and remove tokens of one or two characters.

    Args:
        text: Raw response. ``None``/NaN (what pandas yields for an empty CSV
            cell) is treated as an empty document; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The surviving tokens joined by single spaces; may be ``''``.
    """
    # Missing cells arrive as NaN/None and have no .lower(); returning ''
    # keeps a single blank row from aborting the whole Series.apply().
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer and the stopword set are identical for every call, so
    # build them once and cache them on the function object instead of
    # rebuilding them for each DataFrame row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response; the original text stays in 'Responses'
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# ALBERT tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    The whole column is encoded in one batched tokenizer call rather than
    row by row through the deprecated ``encode_plus``; the resulting tensors
    are the same.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens, special tokens included.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, each with one row per
        text and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Run-level hyperparameters
batch_size = 16
epochs = 8

# --- Training split: token ids, attention masks and integer labels ---
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # reshuffled on every epoch
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# --- Validation split: same tensors, visited in a fixed order ---
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# ALBERT with a classification head sized to the number of label classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a learning rate that decays linearly to zero over all
# optimisation steps (no warm-up phase)
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: each epoch runs one optimisation pass over train_dataloader
# and then a no-grad evaluation pass over val_dataloader.

# Pin the label ids so reports and confusion matrices always span every
# class; otherwise classification_report raises when a split happens to miss
# a class and target_names no longer matches the labels it infers.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Divide by the number of
        # batches seen so far: total_samples / batch_size is fractional on a
        # short final batch and would skew the figure.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean batch loss (not the raw sum) so the figure is on the
    # same scale as the training loss and independent of the batch count.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (evaluation mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Pin the label ids so the report and matrix always span every class;
# without this, classification_report raises if a class is absent from both
# the true and predicted test labels and target_names no longer lines up.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted46.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6036946773529053, Average Training Loss: 1.6036946773529053, Training Accuracy: 0.3125
Epoch 1/8, Batch Loss: 1.6077693700790405, Average Training Loss: 1.605732023715973, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.7744672298431396, Average Training Loss: 1.6619770924250286, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6414337158203125, Average Training Loss: 1.6568412482738495, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.5180596113204956, Average Training Loss: 1.6290849208831788, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6121501922607422, Average Training Loss: 1.6262624661127727, Training Accuracy: 0.22916666666666666
Epoch 1/8, Batch Loss: 1.726491093635559, Average Training Loss: 1.6405808414731706, Training Accuracy: 0.21428571428571427
Epoch 1/8, Batch Loss: 1.6224727630615234, Average Training Loss: 1.6383173316717148, Training Accuracy: 0.2265625
Epoch 1/8, Batch Loss: 1.6235967874526978, Average Training Loss: 1.6366817156473

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7293233082706767
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.77      0.72       132
                Educational Opportunity       0.59      0.57      0.58       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.71      0.81      0.76       130
                 Program Implementation       0.72      0.55      0.63       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.73       665
                           weighted avg       0.73      0.73      0.73       665

Test Confusion Matrix:
[[102  14   2   6   8]
 [ 25  78   4  18  13]
 [  1   2 127   3   0]
 [  8  10   0 105   7]
 [ 14  29   0  16  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK data used by the preprocessing step below
# (stopword list, WordNet for the lemmatizer, punkt for word_tokenize).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus from the Drive-mounted path
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Pipeline, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions found in ``contractions_dict``, delete every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    remove English stopwords and remove tokens of one or two characters.

    Args:
        text: Raw response. ``None``/NaN (what pandas yields for an empty CSV
            cell) is treated as an empty document; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The surviving tokens joined by single spaces; may be ``''``.
    """
    # Missing cells arrive as NaN/None and have no .lower(); returning ''
    # keeps a single blank row from aborting the whole Series.apply().
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer and the stopword set are identical for every call, so
    # build them once and cache them on the function object instead of
    # rebuilding them for each DataFrame row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (only whitespace-delimited words that match a key exactly)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response; the original text stays in 'Responses'
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# ALBERT tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the ALBERT tokenizer.

    The whole column is encoded in one batched tokenizer call rather than
    row by row through the deprecated ``encode_plus``; the resulting tensors
    are the same.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens, special tokens included.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, each with one row per
        text and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Run-level hyperparameters
batch_size = 32
epochs = 8

# --- Training split: token ids, attention masks and integer labels ---
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # reshuffled on every epoch
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# --- Validation split: same tensors, visited in a fixed order ---
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

# ALBERT with a classification head sized to the number of label classes
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Optimizer plus a learning rate that decays linearly to zero over all
# optimisation steps (no warm-up phase)
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: each epoch runs one optimisation pass over train_dataloader
# and then a no-grad evaluation pass over val_dataloader.

# Pin the label ids so reports and confusion matrices always span every
# class; otherwise classification_report raises when a split happens to miss
# a class and target_names no longer matches the labels it infers.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Divide by the number of
        # batches seen so far: total_samples / batch_size is fractional on a
        # short final batch and would skew the figure.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean batch loss (not the raw sum) so the figure is on the
    # same scale as the training loss and independent of the batch count.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (evaluation mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the ground truth in the 'True Labels' column
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Pin the label ids so the report and matrix always span every class;
# without this, classification_report raises if a class is absent from both
# the true and predicted test labels and target_names no longer lines up.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted47.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.661444902420044, Average Training Loss: 1.661444902420044, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6914474964141846, Average Training Loss: 1.6764461994171143, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.7250255346298218, Average Training Loss: 1.6926393111546834, Training Accuracy: 0.14583333333333334
Epoch 1/8, Batch Loss: 1.6362814903259277, Average Training Loss: 1.6785498559474945, Training Accuracy: 0.1640625
Epoch 1/8, Batch Loss: 1.6250193119049072, Average Training Loss: 1.667843747138977, Training Accuracy: 0.18125
Epoch 1/8, Batch Loss: 1.6163417100906372, Average Training Loss: 1.6592600742975872, Training Accuracy: 0.19270833333333334
Epoch 1/8, Batch Loss: 1.6457029581069946, Average Training Loss: 1.6573233434132166, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6261281967163086, Average Training Loss: 1.6534239500761032, Training Accuracy: 0.19140625
Epoch 1/8, Batch Loss: 1.7096487283706665, Average Training Loss: 1.65

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7037593984962406
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.73      0.70       132
                Educational Opportunity       0.52      0.54      0.53       138
                         Family Support       0.96      0.97      0.97       133
                      Financial Support       0.67      0.78      0.72       130
                 Program Implementation       0.72      0.51      0.60       132

                               accuracy                           0.70       665
                              macro avg       0.71      0.71      0.70       665
                           weighted avg       0.71      0.70      0.70       665

Test Confusion Matrix:
[[ 97  18   2   6   9]
 [ 32  74   3  20   9]
 [  0   1 129   3   0]
 [  4  17   0 101   8]
 [ 13  31   0  21  67]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tables used by nltk.word_tokenize from 'punkt_tab'. With the default
# download options an unknown package id is reported in the log and the call
# returns False instead of raising, so older releases are unaffected.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data: the labelled responses used for training and validation
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize and
    lemmatize with NLTK, then discard English stopwords and tokens of two
    characters or fewer.

    Args:
        text: The raw response. Missing values (``None``/NaN, as produced by
            ``pd.read_csv`` for empty cells) are treated as an empty string;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    # The lemmatizer and the stopword set do not depend on the input, so they
    # are built on the first call and reused instead of being recreated (and
    # the stopword corpus re-read) for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole-word lookup; unknown words pass through)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class vocabulary, then map each label to its integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the albert-base-v2 checkpoint
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Tokenize the ``Processed_Response`` column with the module-level tokenizer.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this many
            token ids (special tokens included).

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per row of
        ``df`` and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling a batch from zero rows.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched tokenizer call replaces a deprecated encode_plus call per row
    # followed by torch.cat; the options are unchanged.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch's global RNG so the classifier-head initialisation, the
# RandomSampler shuffling order and dropout are repeatable between runs,
# in line with the fixed random_state used for the train/validation split.
torch.manual_seed(42)

# Encode both splits into fixed-length id / mask tensors
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# kept here because the two are not drop-in identical (e.g. default weight decay).
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 8
# The scheduler is stepped once per training batch, so its horizon is
# batches-per-epoch * epochs.
total_steps = len(train_dataloader) * epochs

# Linear schedule over total_steps with no warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Each epoch makes one optimisation pass over train_dataloader, prints the
# training metrics, then scores the model on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the running mean below must divide by
    # the number of batches seen, which total_samples / batch_size only equals
    # while every batch is full (the final batch can be smaller).
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        # Learning-rate schedule advances once per batch
        scheduler.step()

        # Print running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean loss per validation batch so it is on the same scale as
    # the average training loss instead of growing with the number of batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test set and clean it with the same routine used for training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].map(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Inference over the test set in dataset order, gradients disabled throughout
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        # Highest-scoring class id for every row of the batch
        test_predictions.extend(np.argmax(logits.numpy(), axis=1))

# Decode class ids back into label strings for the output file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Ground truth as class ids, then the headline accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now carrying the predicted labels, to a new CSV
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted48.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.622567892074585, Average Training Loss: 1.622567892074585, Training Accuracy: 0.28125
Epoch 1/8, Batch Loss: 1.770319938659668, Average Training Loss: 1.6964439153671265, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.5936315059661865, Average Training Loss: 1.6621731122334797, Training Accuracy: 0.23958333333333334
Epoch 1/8, Batch Loss: 1.6552592515945435, Average Training Loss: 1.6604446470737457, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6001498699188232, Average Training Loss: 1.6483856916427613, Training Accuracy: 0.225
Epoch 1/8, Batch Loss: 1.6548104286193848, Average Training Loss: 1.6494564811388652, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.5503194332122803, Average Training Loss: 1.6352940457207816, Training Accuracy: 0.24107142857142858
Epoch 1/8, Batch Loss: 1.5334548950195312, Average Training Loss: 1.6225641518831253, Training Accuracy: 0.2421875
Epoch 1/8, Batch Loss: 1.6119989156723022, Average Training Loss: 1.62139

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7233082706766917
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.70      0.71       132
                Educational Opportunity       0.55      0.58      0.56       138
                         Family Support       0.98      0.97      0.97       133
                      Financial Support       0.75      0.77      0.76       130
                 Program Implementation       0.64      0.60      0.62       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 93  16   2   7  14]
 [ 26  80   1  13  18]
 [  0   1 129   3   0]
 [  3  15   0 100  12]
 [  8  34   0  11  79]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tables used by nltk.word_tokenize from 'punkt_tab'. With the default
# download options an unknown package id is reported in the log and the call
# returns False instead of raising, so older releases are unaffected.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data: the labelled responses used for training and validation
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize and
    lemmatize with NLTK, then discard English stopwords and tokens of two
    characters or fewer.

    Args:
        text: The raw response. Missing values (``None``/NaN, as produced by
            ``pd.read_csv`` for empty cells) are treated as an empty string;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    # The lemmatizer and the stopword set do not depend on the input, so they
    # are built on the first call and reused instead of being recreated (and
    # the stopword corpus re-read) for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole-word lookup; unknown words pass through)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class vocabulary, then map each label to its integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the albert-base-v2 checkpoint
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Tokenize the ``Processed_Response`` column with the module-level tokenizer.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this many
            token ids (special tokens included).

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per row of
        ``df`` and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling a batch from zero rows.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched tokenizer call replaces a deprecated encode_plus call per row
    # followed by torch.cat; the options are unchanged.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch's global RNG so the classifier-head initialisation, the
# RandomSampler shuffling order and dropout are repeatable between runs,
# in line with the fixed random_state used for the train/validation split.
torch.manual_seed(42)

# Encode both splits into fixed-length id / mask tensors
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# kept here because the two are not drop-in identical (e.g. default weight decay).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
# The scheduler is stepped once per training batch, so its horizon is
# batches-per-epoch * epochs.
total_steps = len(train_dataloader) * epochs

# Linear schedule over total_steps with no warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Each epoch makes one optimisation pass over train_dataloader, prints the
# training metrics, then scores the model on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the running mean below must divide by
    # the number of batches seen, which total_samples / batch_size only equals
    # while every batch is full (the final batch can be smaller).
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        # Learning-rate schedule advances once per batch
        scheduler.step()

        # Print running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean loss per validation batch so it is on the same scale as
    # the average training loss instead of growing with the number of batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test set and clean it with the same routine used for training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].map(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Inference over the test set in dataset order, gradients disabled throughout
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        # Highest-scoring class id for every row of the batch
        test_predictions.extend(np.argmax(logits.numpy(), axis=1))

# Decode class ids back into label strings for the output file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Ground truth as class ids, then the headline accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now carrying the predicted labels, to a new CSV
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted49.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.65508234500885, Average Training Loss: 1.65508234500885, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5122456550598145, Average Training Loss: 1.5836640000343323, Training Accuracy: 0.3125
Epoch 1/8, Batch Loss: 1.6456646919250488, Average Training Loss: 1.6043308973312378, Training Accuracy: 0.2708333333333333
Epoch 1/8, Batch Loss: 1.6343234777450562, Average Training Loss: 1.6118290424346924, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.569536805152893, Average Training Loss: 1.6033705949783326, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.5638395547866821, Average Training Loss: 1.5967820882797241, Training Accuracy: 0.2604166666666667
Epoch 1/8, Batch Loss: 1.4001352787017822, Average Training Loss: 1.5686896869114466, Training Accuracy: 0.2857142857142857
Epoch 1/8, Batch Loss: 1.5133094787597656, Average Training Loss: 1.5617671608924866, Training Accuracy: 0.3046875
Epoch 1/8, Batch Loss: 1.7105175256729126, Average Training Loss: 1.578294

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/8, Validation Loss: 54.09340798854828, Validation Accuracy: 0.25
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.22      1.00      0.36       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.91      0.21      0.34       102

                               accuracy                           0.25       532
                              macro avg       0.23      0.24      0.14       532
                           weighted avg       0.22      0.25      0.14       532

Validation Confusion Matrix:
[[112   0   0   0   0]
 [101   0   0   0   1]
 [110   0   0   0   0]
 [105   0   0   0   1]
 [ 81   0   0   0  21]]
Epoch 2/8, Batch

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.706766917293233
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.64      0.74      0.69       132
                Educational Opportunity       0.53      0.54      0.54       138
                         Family Support       0.96      0.95      0.95       133
                      Financial Support       0.72      0.82      0.77       130
                 Program Implementation       0.70      0.48      0.57       132

                               accuracy                           0.71       665
                              macro avg       0.71      0.71      0.70       665
                           weighted avg       0.71      0.71      0.70       665

Test Confusion Matrix:
[[ 98  14   2   8  10]
 [ 30  75   3  17  13]
 [  1   2 126   3   1]
 [  9  11   0 107   3]
 [ 15  39   0  14  64]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/alB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tables used by nltk.word_tokenize from 'punkt_tab'. With the default
# download options an unknown package id is reported in the log and the call
# returns False instead of raising, so older releases are unaffected.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data: the labelled responses used for training and validation
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    replace whitespace-separated words found in ``contractions_dict``, delete
    every character that is not an ASCII letter or whitespace, tokenize and
    lemmatize with NLTK, then discard English stopwords and tokens of two
    characters or fewer.

    Args:
        text: The raw response. Missing values (``None``/NaN, as produced by
            ``pd.read_csv`` for empty cells) are treated as an empty string;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    # The lemmatizer and the stopword set do not depend on the input, so they
    # are built on the first call and reused instead of being recreated (and
    # the stopword corpus re-read) for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole-word lookup; unknown words pass through)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords and short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and keep the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Encode labels: learn the class vocabulary, then map each label to its integer id
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenizer matching the albert-base-v2 checkpoint
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Tokenize the ``Processed_Response`` column with the module-level tokenizer.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Every sequence is truncated or padded to exactly this many
            token ids (special tokens included).

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per row of
        ``df`` and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling a batch from zero rows.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched tokenizer call replaces a deprecated encode_plus call per row
    # followed by torch.cat; the options are unchanged.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch's global RNG so the classifier-head initialisation, the
# RandomSampler shuffling order and dropout are repeatable between runs,
# in line with the fixed random_state used for the train/validation split.
torch.manual_seed(42)

# Encode both splits into fixed-length id / mask tensors
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output per class known to the label encoder
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# kept here because the two are not drop-in identical (e.g. default weight decay).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
# The scheduler is stepped once per training batch, so its horizon is
# batches-per-epoch * epochs.
total_steps = len(train_dataloader) * epochs

# Linear schedule over total_steps with no warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Each epoch makes one optimisation pass over train_dataloader, prints the
# training metrics, then scores the model on val_dataloader.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    # Batches are counted explicitly: the running mean below must divide by
    # the number of batches seen, which total_samples / batch_size only equals
    # while every batch is full (the final batch can be smaller).
    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        # Learning-rate schedule advances once per batch
        scheduler.step()

        # Print running average training loss and accuracy after this batch
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean loss per validation batch so it is on the same scale as
    # the average training loss instead of growing with the number of batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Read the test set and clean it with the same routine used for training data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].map(preprocess_text)

# Encode the cleaned test responses
test_inputs, test_masks = tokenize_text(test_df)

# Inference over the test set in dataset order, gradients disabled throughout
model.eval()
test_predictions = []
test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)

with torch.no_grad():
    for inputs, masks in test_dataloader:
        logits = model(inputs, attention_mask=masks).logits
        # Highest-scoring class id for every row of the batch
        test_predictions.extend(np.argmax(logits.numpy(), axis=1))

# Decode class ids back into label strings for the output file
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Ground truth as class ids, then the headline accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and the confusion matrix
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Write the test frame, now carrying the predicted labels, to a new CSV
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted50.csv"
test_df.to_csv(predicted_csv_path, index=False)
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.7258411645889282, Average Training Loss: 1.7258411645889282, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.7019102573394775, Average Training Loss: 1.7138757109642029, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5202211141586304, Average Training Loss: 1.6493241786956787, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6324249505996704, Average Training Loss: 1.6450993716716766, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6736270189285278, Average Training Loss: 1.6508049011230468, Training Accuracy: 0.175
Epoch 1/8, Batch Loss: 1.6879404783248901, Average Training Loss: 1.6569941639900208, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.6586343050003052, Average Training Loss: 1.6572284698486328, Training Accuracy: 0.17857142857142858
Epoch 1/8, Batch Loss: 1.7140828371047974, Average Training Loss: 1.6643352657556534, Training Accuracy: 0.1796875
Epoch 1/8, Batch Loss: 1.6253008842468262, Average Training Loss: 1.6599981122546725, Trai

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/8, Validation Loss: 33.53721669316292, Validation Accuracy: 0.5977443609022557
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.33      0.52      0.40       102
                         Family Support       0.83      0.99      0.90       110
                      Financial Support       0.61      0.89      0.72       106
                 Program Implementation       0.74      0.61      0.67       102

                               accuracy                           0.60       532
                              macro avg       0.50      0.60      0.54       532
                           weighted avg       0.50      0.60      0.53       532

Validation Confusion Matrix:
[[  0  73   6  23  10]
 [  0  53  11  27  11]
 [  0   0 109   1   0]
 [  0   6   5  94   1]
 [  0  29   1  10  62]]
Ep

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7022556390977444
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.73      0.70       132
                Educational Opportunity       0.52      0.57      0.54       138
                         Family Support       0.95      0.93      0.94       133
                      Financial Support       0.70      0.78      0.73       130
                 Program Implementation       0.70      0.52      0.59       132

                               accuracy                           0.70       665
                              macro avg       0.71      0.70      0.70       665
                           weighted avg       0.71      0.70      0.70       665

Test Confusion Matrix:
[[ 96  18   2   7   9]
 [ 24  78   4  19  13]
 [  1   2 124   5   1]
 [  7  16   0 101   6]
 [ 14  37   0  13  68]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources the preprocessing step relies on
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus into a DataFrame
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stopwords, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell) is treated as an
            empty response.

    Returns:
        The cleaned text, or ``''`` when the input is not a string.
    """
    # Empty CSV cells reach us as float NaN, which has no .lower(); treat them
    # as empty text instead of aborting the whole .apply() pass.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    # NOTE(review): the lookup is an exact match per whitespace-separated word,
    # so a contraction with punctuation attached (e.g. "don't,") is not expanded.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stopword set do not depend on the input, so build them
    # on the first call only and reuse them for every later row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords, then short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result alongside the original text
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Fit the encoder on the string labels, then store their integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a stable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column with the notebook's ALBERT tokenizer.

    Each response is encoded on its own, with special tokens added and the
    sequence padded or truncated to ``max_length``. The per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Target sequence length passed to the tokenizer.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of PyTorch tensors.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Build tensors, data loaders, the model, the optimizer and the LR schedule.

# Seed PyTorch's global RNG so that batch shuffling and the newly initialised
# classifier head start from the same random state on every run; without this
# two executions of the same cell are not comparable.
torch.manual_seed(42)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the DataFrame order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per encoded label class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over all steps, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader with loss, accuracy, report and confusion matrix.

# Integer class ids in LabelEncoder order. Passing them to the metric helpers
# keeps the report rows and the matrix shape aligned with
# label_encoder.classes_ even if some class is absent from labels and predictions.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Highest-scoring class id per row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches actually seen: the previous
        # total_samples / batch_size divisor is fractional on a short final batch.
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports undefined
    # precision/recall as 0 without emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the training
    # loss; val_loss itself is the sum over all validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Evaluate the fine-tuned model on the held-out test CSV and save its predictions.

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (eval mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Highest-scoring class id per row
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# Passing the full list of class ids keeps the report rows and the matrix shape
# aligned with label_encoder.classes_ even when a class never occurs in the
# test labels or predictions; without it classification_report raises because
# the number of target_names no longer matches the classes it found.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted51.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.7471580505371094, Average Training Loss: 1.7471580505371094, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.651208758354187, Average Training Loss: 1.6991834044456482, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5962655544281006, Average Training Loss: 1.664877454439799, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.8015120029449463, Average Training Loss: 1.6990360915660858, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.5964105129241943, Average Training Loss: 1.6785109758377075, Training Accuracy: 0.2
Epoch 1/8, Batch Loss: 1.5938984155654907, Average Training Loss: 1.6644088824590046, Training Accuracy: 0.19791666666666666
Epoch 1/8, Batch Loss: 1.5926340818405151, Average Training Loss: 1.6541553395135062, Training Accuracy: 0.19196428571428573
Epoch 1/8, Batch Loss: 1.5509848594665527, Average Training Loss: 1.641259029507637, Training Accuracy: 0.19140625
Epoch 1/8, Batch Loss: 1.5457998514175415, Average Training Loss: 1.630652454

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7022556390977444
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.66      0.75      0.70       132
                Educational Opportunity       0.52      0.50      0.51       138
                         Family Support       0.96      0.97      0.96       133
                      Financial Support       0.70      0.77      0.73       130
                 Program Implementation       0.67      0.53      0.59       132

                               accuracy                           0.70       665
                              macro avg       0.70      0.70      0.70       665
                           weighted avg       0.70      0.70      0.70       665

Test Confusion Matrix:
[[ 99  16   2   6   9]
 [ 32  69   2  19  16]
 [  2   1 129   1   0]
 [  7  12   2 100   9]
 [ 11  34   0  17  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/al

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Fetch the NLTK resources the preprocessing step relies on
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled training corpus into a DataFrame
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stopwords, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell) is treated as an
            empty response.

    Returns:
        The cleaned text, or ``''`` when the input is not a string.
    """
    # Empty CSV cells reach us as float NaN, which has no .lower(); treat them
    # as empty text instead of aborting the whole .apply() pass.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    # NOTE(review): the lookup is an exact match per whitespace-separated word,
    # so a contraction with punctuation attached (e.g. "don't,") is not expanded.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stopword set do not depend on the input, so build them
    # on the first call only and reuse them for every later row.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._resources = resources
    lemmatizer, stop_words = resources

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords, then short words (two characters or fewer)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result alongside the original text
processed_responses = df['Responses'].apply(preprocess_text)
df['Processed_Response'] = processed_responses

# Fit the encoder on the string labels, then store their integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed random_state for a stable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Pretrained ALBERT tokenizer used by tokenize_text
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column with the notebook's ALBERT tokenizer.

    Each response is encoded on its own, with special tokens added and the
    sequence padded or truncated to ``max_length``. The per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Target sequence length passed to the tokenizer.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of PyTorch tensors.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Build tensors, data loaders, the model, the optimizer and the LR schedule.

# Seed PyTorch's global RNG so that batch shuffling and the newly initialised
# classifier head start from the same random state on every run; without this
# two executions of the same cell are not comparable.
torch.manual_seed(42)

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the DataFrame order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune ALBERT model: one output logit per encoded label class
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over all steps, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# evaluation on val_dataloader with loss, accuracy, report and confusion matrix.

# Integer class ids in LabelEncoder order. Passing them to the metric helpers
# keeps the report rows and the matrix shape aligned with
# label_encoder.classes_ even if some class is absent from labels and predictions.
class_ids = list(range(len(label_encoder.classes_)))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batches_seen, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Highest-scoring class id per row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # Divide by the number of batches actually seen: the previous
        # total_samples / batch_size divisor is fractional on a short final batch.
        avg_train_loss = total_loss / batches_seen
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (zero_division=0 reports undefined
    # precision/recall as 0 without emitting a warning per metric)
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train, labels=class_ids)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds.tolist())
        true_labels.extend(labels.numpy().tolist())

    # Report the mean per-batch loss so it is on the same scale as the training
    # loss; val_loss itself is the sum over all validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Evaluate the fine-tuned model on the held-out test CSV and save its predictions.

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (eval mode, no gradient tracking)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    # Highest-scoring class id per row
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds.tolist())

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix).
# Passing the full list of class ids keeps the report rows and the matrix shape
# aligned with label_encoder.classes_ even when a class never occurs in the
# test labels or predictions; without it classification_report raises because
# the number of target_names no longer matches the classes it found.
class_ids = list(range(len(label_encoder.classes_)))
test_classification_report = classification_report(
    test_labels,
    test_predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/alBERTPredicted52.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.7619667053222656, Average Training Loss: 1.7619667053222656, Training Accuracy: 0.09375
Epoch 1/8, Batch Loss: 1.8067911863327026, Average Training Loss: 1.7843789458274841, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.6514993906021118, Average Training Loss: 1.74008576075236, Training Accuracy: 0.13541666666666666
Epoch 1/8, Batch Loss: 1.6138614416122437, Average Training Loss: 1.708529680967331, Training Accuracy: 0.1640625
Epoch 1/8, Batch Loss: 1.6232749223709106, Average Training Loss: 1.6914787292480469, Training Accuracy: 0.175
Epoch 1/8, Batch Loss: 1.4736251831054688, Average Training Loss: 1.6551698048909504, Training Accuracy: 0.203125
Epoch 1/8, Batch Loss: 1.5571725368499756, Average Training Loss: 1.6411701951708113, Training Accuracy: 0.20535714285714285
Epoch 1/8, Batch Loss: 1.6022857427597046, Average Training Loss: 1.636309638619423, Training Accuracy: 0.2109375
Epoch 1/8, Batch Loss: 1.483559012413025, Average Training Loss: 1.61933734

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/8, Validation Loss: 27.38827073574066, Validation Accuracy: 0.21052631578947367
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.21      1.00      0.35       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.21       532
                              macro avg       0.04      0.20      0.07       532
                           weighted avg       0.04      0.21      0.07       532

Validation Confusion Matrix:
[[112   0   0   0   0]
 [102   0   0   0   0]
 [110   0   0   0   0]
 [106   0   0   0   0]
 [102   0   0   0   0]]
E

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/8, Validation Loss: 27.668303608894348, Validation Accuracy: 0.19172932330827067
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.19      1.00      0.32       102

                               accuracy                           0.19       532
                              macro avg       0.04      0.20      0.06       532
                           weighted avg       0.04      0.19      0.06       532

Validation Confusion Matrix:
[[  0   0   0   0 112]
 [  0   0   0   0 102]
 [  0   0   0   0 110]
 [  0   0   0   0 106]
 [  0   0   0   0 102]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/8, Validation Loss: 27.39461660385132, Validation Accuracy: 0.19172932330827067
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.19      1.00      0.32       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.19       532
                              macro avg       0.04      0.20      0.06       532
                           weighted avg       0.04      0.19      0.06       532

Validation Confusion Matrix:
[[  0 112   0   0   0]
 [  0 102   0   0   0]
 [  0 110   0   0   0]
 [  0 106   0   0   0]
 [  0 102   0   0   0]]
E

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4/8, Validation Loss: 26.964696645736694, Validation Accuracy: 0.2236842105263158
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.24      0.78      0.37       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.18      0.30      0.23       106
                 Program Implementation       0.00      0.00      0.00       102

                               accuracy                           0.22       532
                              macro avg       0.09      0.22      0.12       532
                           weighted avg       0.09      0.22      0.12       532

Validation Confusion Matrix:
[[87  0  0 25  0]
 [75  0  0 27  0]
 [88  0  0 22  0]
 [74  0  0 32  0]
 [32  0  0 70  0]]
Epoch 5/8, Batch Loss: 1.6

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5/8, Validation Loss: 26.719431042671204, Validation Accuracy: 0.2706766917293233
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.23      0.90      0.36       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.45      0.44      0.45       102

                               accuracy                           0.27       532
                              macro avg       0.14      0.27      0.16       532
                           weighted avg       0.13      0.27      0.16       532

Validation Confusion Matrix:
[[ 0  0 99  0 13]
 [ 0  0 89  0 13]
 [ 0  0 99  0 11]
 [ 0  0 89  0 17]
 [ 0  0 57  0 45]]
Epoch 6/8, Batch Loss: 1.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 6/8, Validation Loss: 26.697901606559753, Validation Accuracy: 0.2387218045112782
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.22      1.00      0.36       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       1.00      0.15      0.26       102

                               accuracy                           0.24       532
                              macro avg       0.24      0.23      0.12       532
                           weighted avg       0.24      0.24      0.12       532

Validation Confusion Matrix:
[[112   0   0   0   0]
 [102   0   0   0   0]
 [110   0   0   0   0]
 [106   0   0   0   0]
 [ 87   0   0   0  15]]
E

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7/8, Validation Loss: 26.541950821876526, Validation Accuracy: 0.25
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.00      0.00      0.00       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.20      0.86      0.33       106
                 Program Implementation       0.51      0.41      0.45       102

                               accuracy                           0.25       532
                              macro avg       0.14      0.25      0.16       532
                           weighted avg       0.14      0.25      0.15       532

Validation Confusion Matrix:
[[  0   0   0 105   7]
 [  0   0   0  91  11]
 [  0   0   0 102   8]
 [  0   0   0  91  15]
 [  0   0   0  60  42]]
Epoch 8/8, Batc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 8/8, Validation Loss: 26.223894000053406, Validation Accuracy: 0.25375939849624063
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.00      0.00      0.00       112
                Educational Opportunity       0.20      0.92      0.33       102
                         Family Support       0.00      0.00      0.00       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.63      0.40      0.49       102

                               accuracy                           0.25       532
                              macro avg       0.17      0.26      0.16       532
                           weighted avg       0.16      0.25      0.16       532

Validation Confusion Matrix:
[[  0 107   0   0   5]
 [  0  94   0   1   7]
 [  0 105   0   0   5]
 [  0  99   0   0   7]
 [  0  61   0   0  41]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
