<a href="https://colab.research.google.com/github/ChiccoSy/BERT_Based_Multiclass_Text_Classification/blob/main/BERT_Multi_CLass_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install nltk
!pip install beautifulsoup4
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text relies on:
# stop-word list, WordNet (for the lemmatizer) and the tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases make nltk.word_tokenize load 'punkt_tab' instead of
# 'punkt'; fetching both keeps this cell working on old and new versions.
nltk.download('punkt_tab')

# Load the labelled responses (expects 'Responses' and 'Label' columns)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a space-separated token string.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    remove every non-letter character, tokenise, lemmatise, drop English
    stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. Missing values (None or NaN, as pandas
            produces for empty CSV cells) yield an empty string; any other
            non-string value is converted with str() first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); treat them
    # as empty responses instead of raising AttributeError mid-apply.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is by exact whitespace-delimited word,
    # so a contraction with punctuation attached is left as-is here.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Show the first 10 cleaned responses as a sanity check
print(df['Processed_Response'].iloc[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


In [None]:
# Experiment 1 - LR = 2e-5, batch size 64, max-length 128
# Turn each class name in 'Label' into an integer id
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Load the bert-base-uncased tokenizer (lower-casing enabled)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column into fixed-length BERT inputs.

    Uses the module-level `tokenizer`. Every text is truncated or padded to
    exactly `max_length` token ids (special tokens included).

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length of every encoded row.

    Returns:
        tuple: (input_ids, attention_masks), two tensors with one row per
        text and `max_length` columns.
    """
    texts = df['Processed_Response'].tolist()

    # With no rows there is nothing to tokenize (and torch.cat on an empty
    # list raises), so hand back correctly shaped empty tensors instead.
    if not texts:
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # with padding='max_length' every row has the same width either way.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 64

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded class
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    # `step` is the 1-based count of batches processed in this epoch
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the
        # last batch is smaller than batch_size, and makes the final value
        # equal to the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6464433670043945, Average Training Loss: 1.6464433670043945, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6103219985961914, Average Training Loss: 1.628382682800293, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6604269742965698, Average Training Loss: 1.639064113299052, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.681720495223999, Average Training Loss: 1.6497282087802887, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6180360317230225, Average Training Loss: 1.6433897733688354, Training Accuracy: 0.16875
Epoch 1/1, Batch Loss: 1.6258275508880615, Average Training Loss: 1.6404627362887065, Training Accuracy: 0.16666666666666666
Epoch 1/1, Batch Loss: 1.6142746210098267, Average Training Loss: 1.6367215769631522, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.621942162513733, Average Training Loss: 1.6348741501569748, Training Accuracy: 0.17578125
Epoch 1/1, Batch Loss: 1.569898009300232, Average Training Lo

In [None]:
# Experiment 2 - LR = 1e-5, batch size 64, max-length 128
# Turn each class name in 'Label' into an integer id
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Load the bert-base-uncased tokenizer (lower-casing enabled)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column into fixed-length BERT inputs.

    Uses the module-level `tokenizer`. Every text is truncated or padded to
    exactly `max_length` token ids (special tokens included).

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length of every encoded row.

    Returns:
        tuple: (input_ids, attention_masks), two tensors with one row per
        text and `max_length` columns.
    """
    texts = df['Processed_Response'].tolist()

    # With no rows there is nothing to tokenize (and torch.cat on an empty
    # list raises), so hand back correctly shaped empty tensors instead.
    if not texts:
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # with padding='max_length' every row has the same width either way.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 64

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded class
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    # `step` is the 1-based count of batches processed in this epoch
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the
        # last batch is smaller than batch_size, and makes the final value
        # equal to the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5899323225021362, Average Training Loss: 1.5899323225021362, Training Accuracy: 0.171875
Epoch 1/1, Batch Loss: 1.6367987394332886, Average Training Loss: 1.6133655309677124, Training Accuracy: 0.1640625
Epoch 1/1, Batch Loss: 1.6447728872299194, Average Training Loss: 1.6238346497217815, Training Accuracy: 0.16145833333333334
Epoch 1/1, Batch Loss: 1.6239815950393677, Average Training Loss: 1.623871386051178, Training Accuracy: 0.171875
Epoch 1/1, Batch Loss: 1.6226438283920288, Average Training Loss: 1.6236258745193481, Training Accuracy: 0.178125
Epoch 1/1, Batch Loss: 1.6658762693405151, Average Training Loss: 1.6306676069895427, Training Accuracy: 0.17447916666666666
Epoch 1/1, Batch Loss: 1.5933986902236938, Average Training Loss: 1.6253434760229928, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.616764783859253, Average Training Loss: 1.6242711395025253, Training Accuracy: 0.181640625
Epoch 1/1, Batch Loss: 1.5689927339553833, Average Tr

In [None]:
# Evaluation on the held-out validation split
model.eval()
val_loss = 0
predictions, true_labels = [], []

for batch in val_dataloader:
    inputs, masks, labels = batch
    # Scoring only: no gradients are needed
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks, labels=labels)
    loss = outputs.loss
    logits = outputs.logits

    val_loss += loss.item()

    # Predicted class = index of the largest logit in each row
    preds = np.argmax(logits.detach().numpy(), axis=1)
    predictions.extend(preds)
    true_labels.extend(labels.numpy())

# Mean of the per-batch validation losses (previously accumulated but never shown)
avg_val_loss = val_loss / len(val_dataloader)

# Calculate validation accuracy and other metrics
val_accuracy = accuracy_score(true_labels, predictions)
val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

print(f'Average Validation Loss: {avg_val_loss}')
print(f'Validation Accuracy: {val_accuracy}')
print('Classification Report:')
print(val_classification_report)

# Load and preprocess the test data (same cleaning as the training data)
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Test_Data.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (the test set has no labels, so no loss is computed)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map the integer predictions back to the original class names
test_df['Predicted_Label'] = label_encoder.inverse_transform(test_predictions)

# Print the predicted labels
print(test_df[['Responses', 'Predicted_Label']])

# Save the test frame, including the new prediction column, to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')

Validation Accuracy: 0.3793103448275862
Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.42      0.62      0.50        99
                Educational Opportunity       0.29      0.32      0.30        87
                         Family Support       0.35      0.56      0.43        93
                      Financial Support       0.35      0.21      0.27        89
                 Program Implementation       0.84      0.17      0.28        96

                               accuracy                           0.38       464
                              macro avg       0.45      0.38      0.36       464
                           weighted avg       0.46      0.38      0.36       464



  text = BeautifulSoup(text, 'html.parser').get_text()


                                             Responses  \
0    "We were able to save more money because of th...   
1              "It gives access to quality education."   
2    "It was honestly a big help since the budget f...   
3    "The UAQTE gave students the opportunity to en...   
4    "It has given my college experience a good one...   
..                                                 ...   
596  "The UAQTE provides free access to education w...   
597  "Despite being free, quality education was sti...   
598  "I also like the fact that the selection proce...   
599  "The UAQTE Act made a significant difference i...   
600  "It opened doors to higher education that were...   

                             Predicted_Label  
0                             Family Support  
1    Academic Focus and Personal Development  
2                    Educational Opportunity  
3                    Educational Opportunity  
4                    Educational Opportunity  
..                   

In [None]:
# Experiment 1 - LR = 2e-5, batch size 64, max-length 128
# Import libraries (this run reports both training and validation accuracy)
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text relies on:
# stop-word list, WordNet (for the lemmatizer) and the tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases make nltk.word_tokenize load 'punkt_tab' instead of
# 'punkt'; fetching both keeps this cell working on old and new versions.
nltk.download('punkt_tab')

# Load the labelled responses (expects 'Responses' and 'Label' columns)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a space-separated token string.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    remove every non-letter character, tokenise, lemmatise, drop English
    stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. Missing values (None or NaN, as pandas
            produces for empty CSV cells) yield an empty string; any other
            non-string value is converted with str() first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); treat them
    # as empty responses instead of raising AttributeError mid-apply.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is by exact whitespace-delimited word,
    # so a contraction with punctuation attached is left as-is here.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Show the first 10 cleaned responses as a sanity check
print(df['Processed_Response'].iloc[:10])

# Turn each class name in 'Label' into an integer id
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Load the bert-base-uncased tokenizer (lower-casing enabled)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column into fixed-length BERT inputs.

    Uses the module-level `tokenizer`. Every text is truncated or padded to
    exactly `max_length` token ids (special tokens included).

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length of every encoded row.

    Returns:
        tuple: (input_ids, attention_masks), two tensors with one row per
        text and `max_length` columns.
    """
    texts = df['Processed_Response'].tolist()

    # With no rows there is nothing to tokenize (and torch.cat on an empty
    # list raises), so hand back correctly shaped empty tensors instead.
    if not texts:
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    # One batched call replaces the per-row encode_plus + torch.cat loop;
    # with padding='max_length' every row has the same width either way.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 64

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded class
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
# The scheduler is stepped once per batch, so it needs the total batch count
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop (each epoch ends with a pass over the validation split)
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    # `step` is the 1-based count of batches processed in this epoch
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the batch count
        # (rather than total_samples / batch_size) stays correct when the
        # last batch is smaller than batch_size, and makes the final value
        # equal to the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        # Scoring only: no gradients are needed
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch loss, not the raw sum: the sum grows with
    # the number of validation batches and cannot be compared with the
    # average training loss printed above.
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6376235485076904, Average Training Loss: 1.6376235485076904, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6203571557998657, Average Training Loss: 1.628990352153778, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6203199625015259, Average Training Loss: 1.626100222269694, Training Accuracy: 0.19270833333333334
Epoch 1/1, Batch Loss: 1.6546217203140259, Average Training Loss: 1.633230596780777, Training Accuracy: 0.19140625
Epoch 1/1, Batch Loss: 1.5622313022613525, Average Training Loss: 1.619030737876892, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.5757566690444946, Average Training Loss: 1.6118183930714924, Training Accuracy: 0.24479166666666666
Epoch 1/1, Batch Loss: 1.6062955856323242, Average Training Loss: 1.6110294205801827, Training Accuracy: 0.24553571428571427
Epoch 1/1, Batch Loss: 1.5952094793319702, Average Training Loss: 1.6090519279241562, Training Accuracy: 0.25390625
Epoch 1/1, Batch Loss: 1.5746415853500366, Average Traini

In [None]:
# Experiment 2 - LR = 5e-5, batch size 64, max-length 256
# Import libraries (this run reports both training and validation accuracy)
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text relies on:
# stop-word list, WordNet (for the lemmatizer) and the tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases make nltk.word_tokenize load 'punkt_tab' instead of
# 'punkt'; fetching both keeps this cell working on old and new versions.
nltk.download('punkt_tab')

# Load the labelled responses (expects 'Responses' and 'Label' columns)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a space-separated token string.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    remove every non-letter character, tokenise, lemmatise, drop English
    stop words, and drop tokens of two characters or fewer.

    Args:
        text: The raw response. Missing values (None or NaN, as pandas
            produces for empty CSV cells) yield an empty string; any other
            non-string value is converted with str() first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); treat them
    # as empty responses instead of raising AttributeError mid-apply.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions. The lookup is by exact whitespace-delimited word,
    # so a contraction with punctuation attached is left as-is here.
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Show the first 10 cleaned responses as a sanity check
print(df['Processed_Response'].iloc[:10])

# Turn each class name in 'Label' into an integer id
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Load the bert-base-uncased tokenizer (lower-casing enabled)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` with the module-level ``tokenizer``.

    Every text is encoded separately with special tokens added, then padded
    or truncated to ``max_length``.

    Args:
        df: Table holding a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — the per-text tensors
        concatenated along dimension 0, in the column's row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors, forcing int64 class ids rather than
# relying on the platform's default integer width
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Seed PyTorch so the newly initialised classifier head and the batch shuffling repeat across runs
torch.manual_seed(42)

# Train on the GPU when the runtime has one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id; passing these to classification_report keeps it aligned
# with target_names even when a class is absent from the data being scored
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        # Move the batch to the same device as the model
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy to host memory before converting to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print running average training loss and accuracy after this batch.
        # Dividing by the number of batches seen stays correct when the final
        # batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training loss
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5988373756408691, Average Training Loss: 1.5988373756408691, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.655955195426941, Average Training Loss: 1.627396285533905, Training Accuracy: 0.1953125
Epoch 1/1, Batch Loss: 1.6319937705993652, Average Training Loss: 1.628928780555725, Training Accuracy: 0.19270833333333334
Epoch 1/1, Batch Loss: 1.635204553604126, Average Training Loss: 1.6304977238178253, Training Accuracy: 0.19140625
Epoch 1/1, Batch Loss: 1.6025662422180176, Average Training Loss: 1.6249114274978638, Training Accuracy: 0.196875
Epoch 1/1, Batch Loss: 1.5536514520645142, Average Training Loss: 1.613034764925639, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.546419620513916, Average Training Loss: 1.603518315723964, Training Accuracy: 0.22321428571428573
Epoch 1/1, Batch Loss: 1.5574125051498413, Average Training Loss: 1.5977550894021988, Training Accuracy: 0.236328125
Epoch 1/1, Batch Loss: 1.552504301071167, Average Trainin

In [None]:
# Experiment 3
# Import libraries with Training, and Validation Accuracy
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download NLTK resources
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on current NLTK
# releases; 'punkt' is kept for older ones, so the cell runs with whichever
# version the unpinned pip install provided.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data (expects Google Drive to be mounted at /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined string of tokens.

    The text is lowercased, stripped of HTML markup and URLs, has its
    contractions expanded, loses every character that is not an ASCII letter
    or whitespace, and is then tokenized and lemmatized. English stopwords and
    tokens of one or two characters are dropped.

    Args:
        text: Raw response text. A non-string value (e.g. the float NaN that
            pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces; '' if none survive
        or if ``text`` is not a string.
    """
    # Guard against missing values: NaN is a float and has no .lower().
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards instead of recreating both for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions: each whitespace-separated word that is a key of
    # contractions_dict is replaced by its mapped value.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then drop stopwords and words of one or two characters
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Print 10 preprocessed responses as a quick sanity check
print(df['Processed_Response'].iloc[:10])

# Encode labels: fit the encoder on the label column, then map each label to its integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train (80%) and validation (20%) sets with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: load the pretrained uncased BERT vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` with the module-level ``tokenizer``.

    Every text is encoded separately with special tokens added, then padded
    or truncated to ``max_length``.

    Args:
        df: Table holding a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — the per-text tensors
        concatenated along dimension 0, in the column's row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors, forcing int64 class ids rather than
# relying on the platform's default integer width
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Seed PyTorch so the newly initialised classifier head and the batch shuffling repeat across runs
torch.manual_seed(42)

# Train on the GPU when the runtime has one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id; passing these to classification_report keeps it aligned
# with target_names even when a class is absent from the data being scored
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-4, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        # Move the batch to the same device as the model
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy to host memory before converting to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print running average training loss and accuracy after this batch.
        # Dividing by the number of batches seen stays correct when the final
        # batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training loss
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.614174246788025, Average Training Loss: 1.614174246788025, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.7336703538894653, Average Training Loss: 1.6739223003387451, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6334284543991089, Average Training Loss: 1.6604243516921997, Training Accuracy: 0.23958333333333334
Epoch 1/1, Batch Loss: 1.6856367588043213, Average Training Loss: 1.66672745347023, Training Accuracy: 0.23046875
Epoch 1/1, Batch Loss: 1.620737075805664, Average Training Loss: 1.6575293779373168, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.62464439868927, Average Training Loss: 1.6520485480626423, Training Accuracy: 0.22135416666666666
Epoch 1/1, Batch Loss: 1.5947799682617188, Average Training Loss: 1.6438673223767961, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.6171149015426636, Average Training Loss: 1.6405232697725296, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.6198639869689941, Average Training Loss: 1.638227

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Experiment 4
# Import libraries with Training, and Validation Accuracy
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download NLTK resources
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on current NLTK
# releases; 'punkt' is kept for older ones, so the cell runs with whichever
# version the unpinned pip install provided.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data (expects Google Drive to be mounted at /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined string of tokens.

    The text is lowercased, stripped of HTML markup and URLs, has its
    contractions expanded, loses every character that is not an ASCII letter
    or whitespace, and is then tokenized and lemmatized. English stopwords and
    tokens of one or two characters are dropped.

    Args:
        text: Raw response text. A non-string value (e.g. the float NaN that
            pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces; '' if none survive
        or if ``text`` is not a string.
    """
    # Guard against missing values: NaN is a float and has no .lower().
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards instead of recreating both for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions: each whitespace-separated word that is a key of
    # contractions_dict is replaced by its mapped value.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then drop stopwords and words of one or two characters
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Print 10 preprocessed responses as a quick sanity check
print(df['Processed_Response'].iloc[:10])

# Encode labels: fit the encoder on the label column, then map each label to its integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train (80%) and validation (20%) sets with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: load the pretrained uncased BERT vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` with the module-level ``tokenizer``.

    Every text is encoded separately with special tokens added, then padded
    or truncated to ``max_length``.

    Args:
        df: Table holding a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — the per-text tensors
        concatenated along dimension 0, in the column's row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors, forcing int64 class ids rather than
# relying on the platform's default integer width
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Seed PyTorch so the newly initialised classifier head and the batch shuffling repeat across runs
torch.manual_seed(42)

# Train on the GPU when the runtime has one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id; passing these to classification_report keeps it aligned
# with target_names even when a class is absent from the data being scored
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        # Move the batch to the same device as the model
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy to host memory before converting to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print running average training loss and accuracy after this batch.
        # Dividing by the number of batches seen stays correct when the final
        # batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training loss
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.69524085521698, Average Training Loss: 1.69524085521698, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6592140197753906, Average Training Loss: 1.6772274374961853, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.5950652360916138, Average Training Loss: 1.6498400370279949, Training Accuracy: 0.16666666666666666
Epoch 1/1, Batch Loss: 1.589835524559021, Average Training Loss: 1.6348389089107513, Training Accuracy: 0.1640625
Epoch 1/1, Batch Loss: 1.5901908874511719, Average Training Loss: 1.6259093046188355, Training Accuracy: 0.18125
Epoch 1/1, Batch Loss: 1.610303521156311, Average Training Loss: 1.6233083407084148, Training Accuracy: 0.18229166666666666
Epoch 1/1, Batch Loss: 1.6572777032852173, Average Training Loss: 1.628161106790815, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.5868165493011475, Average Training Loss: 1.6229930371046066, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6302295923233032, Average Training Loss: 

In [None]:
# Experiment 5
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download NLTK resources
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on current NLTK
# releases; 'punkt' is kept for older ones, so the cell runs with whichever
# version the unpinned pip install provided.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data (expects Google Drive to be mounted at /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Clean one raw response and return it as a space-joined string of tokens.

    The text is lowercased, stripped of HTML markup and URLs, has its
    contractions expanded, loses every character that is not an ASCII letter
    or whitespace, and is then tokenized and lemmatized. English stopwords and
    tokens of one or two characters are dropped.

    Args:
        text: Raw response text. A non-string value (e.g. the float NaN that
            pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces; '' if none survive
        or if ``text`` is not a string.
    """
    # Guard against missing values: NaN is a float and has no .lower().
    if not isinstance(text, str):
        return ''

    # Build the lemmatizer and stopword set on the first call only and reuse
    # them afterwards instead of recreating both for every row.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
    lemmatizer, stop_words = preprocess_text._resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions: each whitespace-separated word that is a key of
    # contractions_dict is replaced by its mapped value.
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and lemmatize, then drop stopwords and words of one or two characters
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    tokens = [token for token in lemmas if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result next to the original text
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Print 10 preprocessed responses as a quick sanity check
print(df['Processed_Response'].iloc[:10])

# Encode labels: fit the encoder on the label column, then map each label to its integer id
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Split the data into train (80%) and validation (20%) sets with a fixed seed
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Tokenization: load the pretrained uncased BERT vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` with the module-level ``tokenizer``.

    Every text is encoded separately with special tokens added, then padded
    or truncated to ``max_length``.

    Args:
        df: Table holding a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — the per-text tensors
        concatenated along dimension 0, in the column's row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the per-text tensors into one tensor per field
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    return input_ids, attention_masks

# Tokenize both splits
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors, forcing int64 class ids rather than
# relying on the platform's default integer width
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Seed PyTorch so the newly initialised classifier head and the batch shuffling repeat across runs
torch.manual_seed(42)

# Train on the GPU when the runtime has one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

# Every encoded class id; passing these to classification_report keeps it aligned
# with target_names even when a class is absent from the data being scored
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        # Move the batch to the same device as the model
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Copy to host memory before converting to NumPy
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        # Keep every prediction/label pair for the per-epoch training report
        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(label_ids.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print running average training loss and accuracy after this batch.
        # Dividing by the number of batches seen stays correct when the final
        # batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)

        val_loss += outputs.loss.item()

        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training loss
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

# ... (your subsequent code)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4                                make education easier
5    experience free funded material module make st...
6    great opportunity student study university wit...
7                                        helpful study
8      program made possible continue studying college
9      scholarship serve stepping stone onward success
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6628273725509644, Average Training Loss: 1.6628273725509644, Training Accuracy: 0.28125
Epoch 1/5, Batch Loss: 1.617159366607666, Average Training Loss: 1.6399933695793152, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.5728716850280762, Average Training Loss: 1.617619474728902, Training Accuracy: 0.2604166666666667
Epoch 1/5, Batch Loss: 1.6838949918746948, Average Training Loss: 1.6341883540153503, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.570866346359253, Average Training Loss: 1.621523952484131, Training Accuracy: 0.225
Epoch 1/5, Batch Loss: 1.6272401809692383, Average Training Loss: 1.6224766572316487, Training Accuracy: 0.234375
Epoch 1/5, Batch Loss: 1.642409324645996, Average Training Loss: 1.6253241811479842, Training Accuracy: 0.23660714285714285
Epoch 1/5, Batch Loss: 1.5297448635101318, Average Training Loss: 1.6133767664432526, Training Accuracy: 0.25390625
Epoch 1/5, Batch Loss: 1.5774245262145996, Average Training Loss: 1.60938207

In [None]:
# Experiment 6 - Updated Dataset
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text() relies on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'.  Older releases
# only report that the package is unknown and carry on, so requesting both
# keeps the cell working across versions.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data
# NOTE(review): this path only resolves when Google Drive is mounted at
# /content/drive in the current Colab session -- confirm a mount cell runs first.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Stopword set and lemmatizer are identical for every row, so build them once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    keep letters/whitespace only, tokenize, remove stopwords, lemmatize,
    remove stopwords again and drop tokens of length <= 2.

    Args:
        text: The raw response.  Missing values (None / NaN, e.g. blank CSV
            cells) are treated as an empty string; other non-string values
            are converted with ``str()``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # A blank CSV cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole whitespace-delimited words only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) lemmatizer
    # rewrites some stopwords into non-stopwords (e.g. 'does' -> 'doe'),
    # which would otherwise slip through the stopword filter.
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove lemmas that are stopwords, and short words
    tokens = [token for token in tokens if len(token) > 2 and token not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result beside the original text.
raw_responses = df['Responses']
df['Processed_Response'] = raw_responses.apply(preprocess_text)

# Show the first 10 cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label to an integer class id.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# WordPiece tokenizer matching the uncased BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` for BERT.

    Every text is encoded on its own with the module-level ``tokenizer``:
    special tokens are added and the sequence is truncated / padded to
    ``max_length``, with PyTorch tensors requested as output.

    Args:
        df: DataFrame holding a 'Processed_Response' column.
        max_length: Target sequence length for truncation and padding.

    Returns:
        (input_ids, attention_masks): the per-text tensors concatenated
        along dimension 0.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    encoded_rows = [
        tokenizer.encode_plus(text, **encode_options)
        for text in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits with the shared tokenizer.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors.  The dtype is pinned to int64 because the
# classification loss expects long targets, while NumPy's default integer
# width depends on the platform.
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Seed PyTorch so the shuffling order and the newly initialised classifier
# head are the same on every run (same seed as the train/validation split).
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# ... validation batches in dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay
# is set to 0.0 explicitly because PyTorch's default is non-zero, and 0.0
# keeps the update rule the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 1
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

# Linear decay of the learning rate to zero over the whole run, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Run on the GPU when one is available, otherwise stay on the CPU.
# Module.to() moves the parameters in place, so the optimizer created earlier
# keeps pointing at the right tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Every encoded class id, in LabelEncoder order.  Passing this to
# classification_report keeps `target_names` aligned even when a class is
# missing from a split and never predicted (otherwise the report raises).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Bring results back to the CPU before handing them to NumPy.
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(label_ids.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy so far in this epoch.
        # The mean is taken over the number of batches seen, which stays
        # correct when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks, labels = (tensor.to(device) for tensor in batch)
            outputs = model(inputs, attention_mask=masks, labels=labels)

            val_loss += outputs.loss.item()

            preds = np.argmax(outputs.logits.cpu().numpy(), axis=1)
            predictions.extend(preds.tolist())
            true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per batch so it is comparable with the training
    # loss above (the raw sum grows with the number of validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

# ... (your subsequent code)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5823429822921753, Average Training Loss: 1.5823429822921753, Training Accuracy: 0.34375
Epoch 1/1, Batch Loss: 1.653638243675232, Average Training Loss: 1.6179906129837036, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.654285192489624, Average Training Loss: 1.6300888061523438, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6669721603393555, Average Training Loss: 1.6393096446990967, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.7101249694824219, Average Training Loss: 1.6534727096557618, Training Accuracy: 0.19375
Epoch 1/1, Batch Loss: 1.5623457431793213, Average Training Loss: 1.6382848819096882, Training Accuracy: 0.19791666666666666
Epoch 1/1, Batch Loss: 1.553162693977356, Average Training Loss: 1.6261245693479265, Training Accuracy: 0.22321428571428573
Epoch 1/1, Batch Loss: 1.638692855834961, Average Training Loss: 1.6276956051588058, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.609021782875061, Average Training Loss: 1.6256207

In [None]:
# Experiment 6 - Updated Dataset
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text() relies on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'.  Older releases
# only report that the package is unknown and carry on, so requesting both
# keeps the cell working across versions.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data
# NOTE(review): this path only resolves when Google Drive is mounted at
# /content/drive in the current Colab session -- confirm a mount cell runs first.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Stopword set and lemmatizer are identical for every row, so build them once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    keep letters/whitespace only, tokenize, remove stopwords, lemmatize,
    remove stopwords again and drop tokens of length <= 2.

    Args:
        text: The raw response.  Missing values (None / NaN, e.g. blank CSV
            cells) are treated as an empty string; other non-string values
            are converted with ``str()``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # A blank CSV cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole whitespace-delimited words only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) lemmatizer
    # rewrites some stopwords into non-stopwords (e.g. 'does' -> 'doe'),
    # which would otherwise slip through the stopword filter.
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove lemmas that are stopwords, and short words
    tokens = [token for token in tokens if len(token) > 2 and token not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result beside the original text.
raw_responses = df['Responses']
df['Processed_Response'] = raw_responses.apply(preprocess_text)

# Show the first 10 cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label to an integer class id.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# WordPiece tokenizer matching the uncased BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` for BERT.

    Every text is encoded on its own with the module-level ``tokenizer``:
    special tokens are added and the sequence is truncated / padded to
    ``max_length``, with PyTorch tensors requested as output.

    Args:
        df: DataFrame holding a 'Processed_Response' column.
        max_length: Target sequence length for truncation and padding.

    Returns:
        (input_ids, attention_masks): the per-text tensors concatenated
        along dimension 0.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    encoded_rows = [
        tokenizer.encode_plus(text, **encode_options)
        for text in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits with the shared tokenizer.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors.  The dtype is pinned to int64 because the
# classification loss expects long targets, while NumPy's default integer
# width depends on the platform.
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Seed PyTorch so the shuffling order and the newly initialised classifier
# head are the same on every run (same seed as the train/validation split).
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# ... validation batches in dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay
# is set to 0.0 explicitly because PyTorch's default is non-zero, and 0.0
# keeps the update rule the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

# Linear decay of the learning rate to zero over the whole run, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Run on the GPU when one is available, otherwise stay on the CPU.
# Module.to() moves the parameters in place, so the optimizer created earlier
# keeps pointing at the right tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Every encoded class id, in LabelEncoder order.  Passing this to
# classification_report keeps `target_names` aligned even when a class is
# missing from a split and never predicted (otherwise the report raises).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Bring results back to the CPU before handing them to NumPy.
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(label_ids.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy so far in this epoch.
        # The mean is taken over the number of batches seen, which stays
        # correct when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks, labels = (tensor.to(device) for tensor in batch)
            outputs = model(inputs, attention_mask=masks, labels=labels)

            val_loss += outputs.loss.item()

            preds = np.argmax(outputs.logits.cpu().numpy(), axis=1)
            predictions.extend(preds.tolist())
            true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per batch so it is comparable with the training
    # loss above (the raw sum grows with the number of validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

# ... (your subsequent code)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.7262786626815796, Average Training Loss: 1.7262786626815796, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6001033782958984, Average Training Loss: 1.663191020488739, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6564273834228516, Average Training Loss: 1.6609364748001099, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.619482398033142, Average Training Loss: 1.650572955608368, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.5746512413024902, Average Training Loss: 1.6353886127471924, Training Accuracy: 0.23125
Epoch 1/5, Batch Loss: 1.573506474494934, Average Training Loss: 1.6250749230384827, Training Accuracy: 0.2552083333333333
Epoch 1/5, Batch Loss: 1.6406912803649902, Average Training Loss: 1.6273058312279838, Training Accuracy: 0.23660714285714285
Epoch 1/5, Batch Loss: 1.568924903869629, Average Training Loss: 1.6200082153081894, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.4675315618515015, Average Training Loss: 1.6030663649241

In [None]:
# Experiment 6 - Updated Dataset
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text() relies on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'.  Older releases
# only report that the package is unknown and carry on, so requesting both
# keeps the cell working across versions.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data
# NOTE(review): this path only resolves when Google Drive is mounted at
# /content/drive in the current Colab session -- confirm a mount cell runs first.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Stopword set and lemmatizer are identical for every row, so build them once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    keep letters/whitespace only, tokenize, remove stopwords, lemmatize,
    remove stopwords again and drop tokens of length <= 2.

    Args:
        text: The raw response.  Missing values (None / NaN, e.g. blank CSV
            cells) are treated as an empty string; other non-string values
            are converted with ``str()``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # A blank CSV cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole whitespace-delimited words only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) lemmatizer
    # rewrites some stopwords into non-stopwords (e.g. 'does' -> 'doe'),
    # which would otherwise slip through the stopword filter.
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove lemmas that are stopwords, and short words
    tokens = [token for token in tokens if len(token) > 2 and token not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result beside the original text.
raw_responses = df['Responses']
df['Processed_Response'] = raw_responses.apply(preprocess_text)

# Show the first 10 cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label to an integer class id.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# WordPiece tokenizer matching the uncased BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` for BERT.

    Every text is encoded on its own with the module-level ``tokenizer``:
    special tokens are added and the sequence is truncated / padded to
    ``max_length``, with PyTorch tensors requested as output.

    Args:
        df: DataFrame holding a 'Processed_Response' column.
        max_length: Target sequence length for truncation and padding.

    Returns:
        (input_ids, attention_masks): the per-text tensors concatenated
        along dimension 0.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    encoded_rows = [
        tokenizer.encode_plus(text, **encode_options)
        for text in df['Processed_Response']
    ]

    input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
    return input_ids, attention_masks

# Encode both splits with the shared tokenizer.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors.  The dtype is pinned to int64 because the
# classification loss expects long targets, while NumPy's default integer
# width depends on the platform.
train_labels = torch.tensor(train_df['Encoded_Label'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Encoded_Label'].values, dtype=torch.long)

# Seed PyTorch so the shuffling order and the newly initialised classifier
# head are the same on every run (same seed as the train/validation split).
torch.manual_seed(42)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# ... validation batches in dataset order.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay
# is set to 0.0 explicitly because PyTorch's default is non-zero, and 0.0
# keeps the update rule the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

# Linear decay of the learning rate to zero over the whole run, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
# Run on the GPU when one is available, otherwise stay on the CPU.
# Module.to() moves the parameters in place, so the optimizer created earlier
# keeps pointing at the right tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Every encoded class id, in LabelEncoder order.  Passing this to
# classification_report keeps `target_names` aligned even when a class is
# missing from a split and never predicted (otherwise the report raises).
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Bring results back to the CPU before handing them to NumPy.
        preds = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        label_ids = labels.cpu().numpy()

        correct_predictions += np.sum(preds == label_ids)
        total_samples += len(label_ids)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(label_ids.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy so far in this epoch.
        # The mean is taken over the number of batches seen, which stays
        # correct when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks, labels = (tensor.to(device) for tensor in batch)
            outputs = model(inputs, attention_mask=masks, labels=labels)

            val_loss += outputs.loss.item()

            preds = np.argmax(outputs.logits.cpu().numpy(), axis=1)
            predictions.extend(preds.tolist())
            true_labels.extend(labels.cpu().numpy().tolist())

    # Report the mean loss per batch so it is comparable with the training
    # loss above (the raw sum grows with the number of validation batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

# ... (your subsequent code)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Batch Loss: 1.6463478803634644, Average Training Loss: 1.6463478803634644, Training Accuracy: 0.28125
Epoch 1/10, Batch Loss: 1.6296244859695435, Average Training Loss: 1.637986183166504, Training Accuracy: 0.234375
Epoch 1/10, Batch Loss: 1.643173098564148, Average Training Loss: 1.6397151549657185, Training Accuracy: 0.22916666666666666
Epoch 1/10, Batch Loss: 1.5889431238174438, Average Training Loss: 1.62702214717865, Training Accuracy: 0.21875
Epoch 1/10, Batch Loss: 1.6176304817199707, Average Training Loss: 1.625143814086914, Training Accuracy: 0.20625
Epoch 1/10, Batch Loss: 1.602550745010376, Average Training Loss: 1.6213783025741577, Training Accuracy: 0.21354166666666666
Epoch 1/10, Batch Loss: 1.5920034646987915, Average Training Loss: 1.6171818971633911, Training Accuracy: 0.19642857142857142
Epoch 1/10, Batch Loss: 1.622205138206482, Average Training Loss: 1.6178098022937775, Training Accuracy: 0.1953125
Epoch 1/10, Batch Loss: 1.554190993309021, Average Train

In [None]:
# Experiment 6 - Updated Dataset
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources that preprocess_text() relies on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'.  Older releases
# only report that the package is unknown and carry on, so requesting both
# keeps the cell working across versions.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data
# NOTE(review): this path only resolves when Google Drive is mounted at
# /content/drive in the current Colab session -- confirm a mount cell runs first.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Stopword set and lemmatizer are identical for every row, so build them once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw response into a space-separated string of tokens.

    Steps, in order: lowercase, strip HTML, drop URLs, expand contractions,
    keep letters/whitespace only, tokenize, remove stopwords, lemmatize,
    remove stopwords again and drop tokens of length <= 2.

    Args:
        text: The raw response.  Missing values (None / NaN, e.g. blank CSV
            cells) are treated as an empty string; other non-string values
            are converted with ``str()``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # A blank CSV cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whole whitespace-delimited words only)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)

    # Remove stopwords BEFORE lemmatizing: the default (noun) lemmatizer
    # rewrites some stopwords into non-stopwords (e.g. 'does' -> 'doe'),
    # which would otherwise slip through the stopword filter.
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove lemmas that are stopwords, and short words
    tokens = [token for token in tokens if len(token) > 2 and token not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result beside the original text.
raw_responses = df['Responses']
df['Processed_Response'] = raw_responses.apply(preprocess_text)

# Show the first 10 cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label to an integer class id.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# WordPiece tokenizer matching the uncased BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode ``df['Processed_Response']`` with the module-level BERT ``tokenizer``.

    Every text is encoded on its own with special tokens added and is padded
    or truncated to ``max_length`` positions.

    Args:
        df: DataFrame with a ``'Processed_Response'`` column.
        max_length: Sequence length passed to the tokenizer for padding and
            truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text tensors
        concatenated along dim 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Gather the per-text tensors and concatenate them into one batch each.
    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]
    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed PyTorch's global RNG so batch shuffling, dropout and the newly
# initialised classifier head start from the same state on every run
# (same seed value as the train/validation split).
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Explicit class ids for the reports: without `labels`, classification_report
# raises when a class is missing from both the true and predicted labels,
# because target_names then has more entries than the classes it finds.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, labels=class_ids, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

# ... (your subsequent code)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6251158714294434, Average Training Loss: 1.6251158714294434, Training Accuracy: 0.375
Epoch 1/5, Batch Loss: 1.5659782886505127, Average Training Loss: 1.595547080039978, Training Accuracy: 0.3125
Epoch 1/5, Batch Loss: 1.6158950328826904, Average Training Loss: 1.6023297309875488, Training Accuracy: 0.2708333333333333
Epoch 1/5, Batch Loss: 1.6136127710342407, Average Training Loss: 1.6051504909992218, Training Accuracy: 0.296875
Epoch 1/5, Batch Loss: 1.6251236200332642, Average Training Loss: 1.6091451168060302, Training Accuracy: 0.3
Epoch 1/5, Batch Loss: 1.4833487272262573, Average Training Loss: 1.5881790518760681, Training Accuracy: 0.3229166666666667
Epoch 1/5, Batch Loss: 1.5462197065353394, Average Training Loss: 1.5821848596845354, Training Accuracy: 0.3125
Epoch 1/5, Batch Loss: 1.6894651651382446, Average Training Loss: 1.595594897866249, Training Accuracy: 0.296875
Epoch 1/5, Batch Loss: 1.5930513143539429, Average Training Loss: 1.59531227747599

In [None]:
# ... (Previous code)

# confusion_matrix is called below, but the training cells' import block only
# brings in accuracy_score and classification_report; import it here so this
# cell does not depend on leftover kernel state.
from sklearn.metrics import confusion_matrix

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (eval mode disables dropout)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Fix the class ids so the report and the confusion matrix always have one
# row per trained class; without `labels`, classification_report raises when
# a class is absent from both the true and predicted test labels.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.693034238488784
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.64      0.78      0.70       178
                Educational Opportunity       0.53      0.59      0.56       188
                         Family Support       0.90      0.94      0.92       100
                      Financial Support       0.71      0.77      0.74       150
                 Program Implementation       0.83      0.56      0.67       231

                               accuracy                           0.69       847
                              macro avg       0.72      0.73      0.72       847
                           weighted avg       0.71      0.69      0.69       847

Test Confusion Matrix:
[[139  23   2   6   8]
 [ 39 110   5  20  14]
 [  1   2  94   3   0]
 [ 12  17   2 115   4]
 [ 27  56   1  18 129]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/pre

In [None]:
# ... (Previous code)

# confusion_matrix is called below, but the training cells' import block only
# brings in accuracy_score and classification_report; import it here so this
# cell does not depend on leftover kernel state.
from sklearn.metrics import confusion_matrix

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (eval mode disables dropout)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Fix the class ids so the report and the confusion matrix always have one
# row per trained class; without `labels`, classification_report raises when
# a class is absent from both the true and predicted test labels.
class_ids = np.arange(len(label_encoder.classes_))

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, labels=class_ids, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions, labels=class_ids)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6883116883116883
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.64      0.78      0.70       178
                Educational Opportunity       0.53      0.58      0.55       188
                         Family Support       0.90      0.94      0.92       100
                      Financial Support       0.74      0.70      0.72       150
                 Program Implementation       0.77      0.59      0.67       231

                               accuracy                           0.69       847
                              macro avg       0.72      0.72      0.71       847
                           weighted avg       0.70      0.69      0.69       847

Test Confusion Matrix:
[[138  23   2   4  11]
 [ 40 109   5  15  19]
 [  1   2  94   3   0]
 [ 14  19   2 105  10]
 [ 24  54   2  14 137]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/pr

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used during preprocessing.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept so older NLTK installs keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML, drop ``http...`` URLs, expand
    whitespace-delimited words found in ``contractions_dict``, remove every
    character that is not a letter or whitespace, tokenise, drop English
    stopwords, lemmatise, and discard tokens of two characters or fewer.

    Args:
        text: Raw response. ``None``/NaN is treated as an empty response; any
            other non-string value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower(); NaN is
        # the only value that is not equal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Test the surface form first: the lemmatizer rewrites stopwords such
        # as 'does' -> 'doe', which a check made only after lemmatisation
        # would let through.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Original filters, still applied to the lemma: stopwords and short words.
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Handle NaN values in the 'Label' column BEFORE encoding: filling them after
# fit_transform never reaches 'Encoded_Label', and LabelEncoder rejects a
# column that mixes strings with float NaN. Plain assignment replaces the
# chained `inplace=True` call, which does not update df under pandas
# copy-on-write.
df['Label'] = df['Label'].fillna('default_label')

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` with the module-level BERT ``tokenizer``.

    Every text is encoded on its own with special tokens added and is padded
    or truncated to ``max_length`` positions.

    Args:
        df: DataFrame with a ``'Processed_Response'`` column.
        max_length: Sequence length passed to the tokenizer for padding and
            truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text tensors
        concatenated along dim 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Gather the per-text tensors and concatenate them into one batch each.
    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]
    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed PyTorch's global RNG so batch shuffling, dropout and the newly
# initialised classifier head start from the same state on every run
# (same seed value as the train/validation split).
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Explicit class ids for the report: without `labels`, classification_report
# raises when a class is missing from both the true and predicted labels,
# because target_names then has more entries than the classes it finds.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6358721256256104, Average Training Loss: 1.6358721256256104, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.8207361698150635, Average Training Loss: 1.728304147720337, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5434253215789795, Average Training Loss: 1.6666778723398845, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.8463597297668457, Average Training Loss: 1.7115983366966248, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.776472568511963, Average Training Loss: 1.7245731830596924, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.559312343597412, Average Training Loss: 1.697029709815979, Training Accuracy: 0.13541666666666666
Epoch 1/5, Batch Loss: 1.5095734596252441, Average Training Loss: 1.6702502455030168, Training Accuracy: 0.15178571428571427
Epoch 1/5, Batch Loss: 1.7437807321548462, Average Training Loss: 1.6794415563344955, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.626349687576294, Average Training Loss: 1.6735424598058064

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used during preprocessing.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept so older NLTK installs keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML, drop ``http...`` URLs, expand
    whitespace-delimited words found in ``contractions_dict``, remove every
    character that is not a letter or whitespace, tokenise, drop English
    stopwords, lemmatise, and discard tokens of two characters or fewer.

    Args:
        text: Raw response. ``None``/NaN is treated as an empty response; any
            other non-string value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower(); NaN is
        # the only value that is not equal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Test the surface form first: the lemmatizer rewrites stopwords such
        # as 'does' -> 'doe', which a check made only after lemmatisation
        # would let through.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Original filters, still applied to the lemma: stopwords and short words.
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Handle NaN values in the 'Label' column BEFORE encoding: filling them after
# fit_transform never reaches 'Encoded_Label', and LabelEncoder rejects a
# column that mixes strings with float NaN. Plain assignment replaces the
# chained `inplace=True` call, which does not update df under pandas
# copy-on-write.
df['Label'] = df['Label'].fillna('default_label')

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` with the module-level BERT ``tokenizer``.

    Every text is encoded on its own with special tokens added and is padded
    or truncated to ``max_length`` positions.

    Args:
        df: DataFrame with a ``'Processed_Response'`` column.
        max_length: Sequence length passed to the tokenizer for padding and
            truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text tensors
        concatenated along dim 0, in row order.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Gather the per-text tensors and concatenate them into one batch each.
    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]
    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Seed PyTorch's global RNG so batch shuffling, dropout and the newly
# initialised classifier head start from the same state on every run
# (same seed value as the train/validation split).
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Explicit class ids for the report: without `labels`, classification_report
# raises when a class is missing from both the true and predicted labels,
# because target_names then has more entries than the classes it finds.
class_ids = np.arange(len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 15
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean over the batches seen so far. Dividing by the batch
        # count (not total_samples / batch_size) stays correct when the last
        # batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch validation loss so it is on the same scale as
    # the average training loss (the raw sum grows with the number of batches).
    avg_val_loss = val_loss / len(val_dataloader)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15, Batch Loss: 1.748457908630371, Average Training Loss: 1.748457908630371, Training Accuracy: 0.109375
Epoch 1/15, Batch Loss: 1.596543788909912, Average Training Loss: 1.6725008487701416, Training Accuracy: 0.2109375
Epoch 1/15, Batch Loss: 1.6306307315826416, Average Training Loss: 1.6585441430409749, Training Accuracy: 0.19791666666666666
Epoch 1/15, Batch Loss: 1.6481775045394897, Average Training Loss: 1.6559524834156036, Training Accuracy: 0.1875
Epoch 1/15, Batch Loss: 1.610304594039917, Average Training Loss: 1.6468229055404664, Training Accuracy: 0.203125
Epoch 1/15, Batch Loss: 1.5896228551864624, Average Training Loss: 1.637289563814799, Training Accuracy: 0.21354166666666666
Epoch 1/15, Batch Loss: 1.6083388328552246, Average Training Loss: 1.6331537451062883, Training Accuracy: 0.22098214285714285
Epoch 1/15, Batch Loss: 1.5885634422302246, Average Training Loss: 1.6275799572467804, Training Accuracy: 0.22265625
Epoch 1/15, Batch Loss: 1.6194385290145874, Average

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used during preprocessing.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept so older NLTK installs keep working.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one free-text response into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip HTML, drop ``http...`` URLs, expand
    whitespace-delimited words found in ``contractions_dict``, remove every
    character that is not a letter or whitespace, tokenise, drop English
    stopwords, lemmatise, and discard tokens of two characters or fewer.

    Args:
        text: Raw response. ``None``/NaN is treated as an empty response; any
            other non-string value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower(); NaN is
        # the only value that is not equal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Test the surface form first: the lemmatizer rewrites stopwords such
        # as 'does' -> 'doe', which a check made only after lemmatisation
        # would let through.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Original filters, still applied to the lemma: stopwords and short words.
        if lemma in stop_words or len(lemma) <= 2:
            continue
        cleaned_tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Handle NaN values in the 'Label' column BEFORE encoding: filling them after
# fit_transform never reaches 'Encoded_Label', and LabelEncoder rejects a
# column that mixes strings with float NaN. Plain assignment replaces the
# chained `inplace=True` call, which does not update df under pandas
# copy-on-write.
df['Label'] = df['Label'].fillna('default_label')

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    batched call instead of one ``encode_plus`` call per row.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Target sequence length; longer texts are truncated and
            shorter ones padded to exactly this length.

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than
        # failing (the per-row version raised on torch.cat of an empty list).
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch before anything random happens (classifier-head initialisation,
# RandomSampler shuffling, dropout) so that repeated runs of this cell are
# comparable. 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order; validation keeps dataset order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# is passed explicitly because PyTorch's default (0.01) differs from the 0.0
# default of the transformers implementation used before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# validation pass with accuracy and a per-class report.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()  # learning-rate schedule is stepped once per batch

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays correct
        # when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch loss so it is comparable with the average
    # training loss above (val_loss itself is the sum over batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics. `labels` is passed
    # explicitly so the report still works when a class is absent from the
    # validation split (otherwise target_names would not line up).
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Batch Loss: 1.5916208028793335, Average Training Loss: 1.5916208028793335, Training Accuracy: 0.25
Epoch 1/10, Batch Loss: 1.607102394104004, Average Training Loss: 1.5993615984916687, Training Accuracy: 0.3125
Epoch 1/10, Batch Loss: 1.6332834959030151, Average Training Loss: 1.6106688976287842, Training Accuracy: 0.3333333333333333
Epoch 1/10, Batch Loss: 1.7683215141296387, Average Training Loss: 1.6500820517539978, Training Accuracy: 0.28125
Epoch 1/10, Batch Loss: 1.7336218357086182, Average Training Loss: 1.6667900085449219, Training Accuracy: 0.25
Epoch 1/10, Batch Loss: 1.5815730094909668, Average Training Loss: 1.6525871753692627, Training Accuracy: 0.22916666666666666
Epoch 1/10, Batch Loss: 1.6780096292495728, Average Training Loss: 1.6562189544950212, Training Accuracy: 0.21428571428571427
Epoch 1/10, Batch Loss: 1.6022987365722656, Average Training Loss: 1.6494789272546768, Training Accuracy: 0.2109375
Epoch 1/10, Batch Loss: 1.5704214572906494, Average Trainin

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by the preprocessing step: the stop-word list,
# WordNet (for the lemmatizer) and 'punkt' (for nltk.word_tokenize).
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Read the labelled training dataset into a DataFrame.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Shared by every preprocess_text call: both objects are loop-invariant, so
# rebuilding them for each row of a DataFrame.apply is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def _expand_contraction(word):
    """Return the expansion of `word` from contractions_dict, or `word` itself."""
    if word in contractions_dict:
        return contractions_dict[word]
    # Retry without surrounding punctuation so tokens such as `"it's` or
    # `don't,` are still recognised; that punctuation is removed later anyway.
    core = re.sub(r'^\W+|\W+$', '', word)
    return contractions_dict.get(core, word)


def preprocess_text(text):
    """Normalise one free-text response into a space-joined string of tokens.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stop words, drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and float NaN (an empty CSV cell read
            by pandas) yield ``''``; any other non-string is passed through
            ``str`` first.

    Returns:
        The cleaned text as a single string; it may be empty.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(_expand_contraction(word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [_LEMMATIZER.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in _STOP_WORDS and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every response; the tokenizer below consumes this derived column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Handle NaN values in the 'Label' column BEFORE encoding. Filling them after
# fit_transform (as this cell used to) never reaches the encoder, and the
# chained `df['Label'].fillna(..., inplace=True)` form may not modify `df` at all.
df['Label'] = df['Label'].fillna('default_label')

# Encode labels as integer class ids; label_encoder.classes_ maps ids back to names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets (80/20, fixed seed).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    batched call instead of one ``encode_plus`` call per row.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Target sequence length; longer texts are truncated and
            shorter ones padded to exactly this length.

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than
        # failing (the per-row version raised on torch.cat of an empty list).
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch before anything random happens (classifier-head initialisation,
# RandomSampler shuffling, dropout) so that repeated runs of this cell are
# comparable. 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

# Training batches are drawn in random order; validation keeps dataset order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# is passed explicitly because PyTorch's default (0.01) differs from the 0.0
# default of the transformers implementation used before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# validation pass with accuracy and a per-class report.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()  # learning-rate schedule is stepped once per batch

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays correct
        # when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch loss so it is comparable with the average
    # training loss above (val_loss itself is the sum over batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics. `labels` is passed
    # explicitly so the report still works when a class is absent from the
    # validation split (otherwise target_names would not line up).
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6141594648361206, Average Training Loss: 1.6141594648361206, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5217159986495972, Average Training Loss: 1.5679377317428589, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6075111627578735, Average Training Loss: 1.5811288754145305, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.6291139125823975, Average Training Loss: 1.5931251347064972, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6105462312698364, Average Training Loss: 1.596609354019165, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6292423009872437, Average Training Loss: 1.6020481785138447, Training Accuracy: 0.19791666666666666
Epoch 1/5, Batch Loss: 1.6729230880737305, Average Training Loss: 1.6121731655938285, Training Accuracy: 0.19642857142857142
Epoch 1/5, Batch Loss: 1.6519626379013062, Average Training Loss: 1.6171468496322632, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6739495992660522, Average Training Loss

KeyboardInterrupt: 

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by the preprocessing step: the stop-word list,
# WordNet (for the lemmatizer) and 'punkt' (for nltk.word_tokenize).
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Read the labelled training dataset into a DataFrame.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_dataset.csv"
df = pd.read_csv(file_path)

# Preprocessing
# Shared by every preprocess_text call: both objects are loop-invariant, so
# rebuilding them for each row of a DataFrame.apply is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def _expand_contraction(word):
    """Return the expansion of `word` from contractions_dict, or `word` itself."""
    if word in contractions_dict:
        return contractions_dict[word]
    # Retry without surrounding punctuation so tokens such as `"it's` or
    # `don't,` are still recognised; that punctuation is removed later anyway.
    core = re.sub(r'^\W+|\W+$', '', word)
    return contractions_dict.get(core, word)


def preprocess_text(text):
    """Normalise one free-text response into a space-joined string of tokens.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    contractions, remove every non-letter character, tokenize, lemmatize,
    drop English stop words, drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and float NaN (an empty CSV cell read
            by pandas) yield ``''``; any other non-string is passed through
            ``str`` first.

    Returns:
        The cleaned text as a single string; it may be empty.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join(_expand_contraction(word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [_LEMMATIZER.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in _STOP_WORDS and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every response; the tokenizer below consumes this derived column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Handle NaN values in the 'Label' column BEFORE encoding. Filling them after
# fit_transform (as this cell used to) never reaches the encoder, and the
# chained `df['Label'].fillna(..., inplace=True)` form may not modify `df` at all.
df['Label'] = df['Label'].fillna('default_label')

# Encode labels as integer class ids; label_encoder.classes_ maps ids back to names.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets (80/20, fixed seed).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode ``df['Processed_Response']`` into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    batched call instead of one ``encode_plus`` call per row.

    Args:
        df: DataFrame with a ``Processed_Response`` column of strings.
        max_length: Target sequence length; longer texts are truncated and
            shorter ones padded to exactly this length.

    Returns:
        ``(input_ids, attention_masks)``: two tensors with one row per
        DataFrame row and ``max_length`` columns.
    """
    texts = df['Processed_Response'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than
        # failing (the per-row version raised on torch.cat of an empty list).
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Seed PyTorch before anything random happens (classifier-head initialisation,
# RandomSampler shuffling, dropout) so that repeated runs of this cell are
# comparable. 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Tokenize training and validation data
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

# Training batches are drawn in random order; validation keeps dataset order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output logit per encoded label class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# is passed explicitly because PyTorch's default (0.01) differs from the 0.0
# default of the transformers implementation used before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
total_steps = len(train_dataloader) * epochs  # one scheduler step per batch

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: one pass over train_dataloader per epoch, followed by a full
# validation pass with accuracy and a per-class report.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()  # learning-rate schedule is stepped once per batch

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays correct
        # when the final batch is smaller than batch_size.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Report the mean per-batch loss so it is comparable with the average
    # training loss above (val_loss itself is the sum over batches).
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics. `labels` is passed
    # explicitly so the report still works when a class is absent from the
    # validation split (otherwise target_names would not line up).
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Batch Loss: 1.6330713033676147, Average Training Loss: 1.6330713033676147, Training Accuracy: 0.25
Epoch 1/10, Batch Loss: 1.6665178537368774, Average Training Loss: 1.649794578552246, Training Accuracy: 0.1875
Epoch 1/10, Batch Loss: 1.6421445608139038, Average Training Loss: 1.6472445726394653, Training Accuracy: 0.17708333333333334
Epoch 1/10, Batch Loss: 1.6814308166503906, Average Training Loss: 1.6557911336421967, Training Accuracy: 0.171875
Epoch 1/10, Batch Loss: 1.668810486793518, Average Training Loss: 1.658395004272461, Training Accuracy: 0.16875
Epoch 1/10, Batch Loss: 1.6404330730438232, Average Training Loss: 1.655401349067688, Training Accuracy: 0.16666666666666666
Epoch 1/10, Batch Loss: 1.6395388841629028, Average Training Loss: 1.6531352826527186, Training Accuracy: 0.17857142857142858
Epoch 1/10, Batch Loss: 1.6691361665725708, Average Training Loss: 1.6551353931427002, Training Accuracy: 0.171875
Epoch 1/10, Batch Loss: 1.5530335903167725, Average Traini

In [None]:
# Load and preprocess the test data
# confusion_matrix is used below but is not part of the sklearn.metrics import
# in the training cells above, so import it here.
from sklearn.metrics import confusion_matrix

test_file_path = "/content/drive/MyDrive/Dissertation_UC/Test_Data.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Create DataLoader for the test set (no labels: inputs and masks only)
test_data = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in test_dataloader:
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Calculate test accuracy and other metrics.
# The previous version referenced `true_test_labels` and `test_loss`, neither
# of which was ever defined, so the cell failed with a NameError. No loss is
# computed here because the model is called without labels.
# NOTE(review): assumes ground truth, when present, lives in a 'True Labels'
# column as in the other evaluation cells of this notebook; confirm for this file.
if 'True Labels' in test_df.columns:
    true_test_labels = label_encoder.transform(test_df['True Labels'])
    class_ids = np.arange(len(label_encoder.classes_))

    test_accuracy = accuracy_score(true_test_labels, test_predictions)
    test_classification_report = classification_report(
        true_test_labels,
        test_predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
    conf_matrix = confusion_matrix(true_test_labels, test_predictions, labels=class_ids)

    print(f'Test Accuracy: {test_accuracy}')
    print('Test Classification Report:')
    print(test_classification_report)
    print('Confusion Matrix:')
    print(conf_matrix)
else:
    print("No 'True Labels' column in the test data; skipping evaluation metrics.")

# Save predictions to a new CSV file. The old target ended in '.csvv' (typo);
# the path now matches the one used by the other prediction cells.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df['Predicted_Label'] = label_encoder.inverse_transform(test_predictions)
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels for the test set saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


NameError: name 'true_test_labels' is not defined

In [None]:
# ... (Previous code)

# confusion_matrix is used below but is not part of the sklearn.metrics import
# in the training cells above; import it here so this cell does not depend on
# a name left over in the kernel from some other cell.
from sklearn.metrics import confusion_matrix

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (inputs and masks only; no labels are passed)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the file's 'True Labels' column, encoded with
# the same LabelEncoder that was fitted on the training labels.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5759075907590759
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.52      0.60      0.56       108
                Educational Opportunity       0.41      0.48      0.44       116
                         Family Support       0.72      0.90      0.80        90
                      Financial Support       0.56      0.64      0.60       106
                 Program Implementation       0.72      0.42      0.54       186

                               accuracy                           0.58       606
                              macro avg       0.59      0.61      0.59       606
                           weighted avg       0.60      0.58      0.57       606

Test Confusion Matrix:
[[65 15  7  8 13]
 [29 56  8 13 10]
 [ 2  1 81  4  2]
 [ 3 21  9 68  5]
 [26 45  8 28 79]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/predicted.csv


In [None]:
# ... (Previous code)

# confusion_matrix is used below but is not part of the sklearn.metrics import
# in the training cells above; import it here so this cell does not depend on
# a name left over in the kernel from some other cell.
from sklearn.metrics import confusion_matrix

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (inputs and masks only; no labels are passed)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy against the file's 'True Labels' column, encoded with
# the same LabelEncoder that was fitted on the training labels.
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6567656765676567
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.64      0.68      0.66       108
                Educational Opportunity       0.45      0.71      0.55       116
                         Family Support       0.92      0.87      0.89        90
                      Financial Support       0.63      0.69      0.66       106
                 Program Implementation       0.84      0.49      0.62       186

                               accuracy                           0.66       606
                              macro avg       0.70      0.69      0.68       606
                           weighted avg       0.70      0.66      0.66       606

Test Confusion Matrix:
[[73 19  1 10  5]
 [18 82  2  6  8]
 [ 3  4 78  5  0]
 [ 4 22  2 73  5]
 [16 54  2 22 92]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/predicted.csv


In [None]:
# Prediction with No True Label

# Read the unlabelled responses and clean them the same way as the training data.
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Test_Data_Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Encode the cleaned text into input ids and attention masks.
test_inputs, test_masks = tokenize_text(test_df)

# Run the model in evaluation mode, batch by batch, collecting the arg-max class id.
model.eval()
test_predictions = []

for inputs, masks in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    with torch.no_grad():
        logits = model(inputs, attention_mask=masks).logits
    test_predictions.extend(np.argmax(logits.detach().numpy(), axis=1))

# Translate class ids back into the original label names.
test_df['Predicted_Label'] = label_encoder.inverse_transform(test_predictions)

# Show each response next to its predicted label.
print(test_df[['Responses', 'Predicted_Label']])

# Write the frame, including predictions, to a new CSV file.
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Confirm where the file was written.
print(f'Predicted labels saved to {predicted_csv_path}')


  text = BeautifulSoup(text, 'html.parser').get_text()


                                             Responses  \
0    "We were able to save more money because of th...   
1              "It gives access to quality education."   
2    "It was honestly a big help since the budget f...   
3    "The UAQTE gave students the opportunity to en...   
4    "It has given my college experience a good one...   
..                                                 ...   
596  "The UAQTE provides free access to education w...   
597  "Despite being free, quality education was sti...   
598  "I also like the fact that the selection proce...   
599  "The UAQTE Act made a significant difference i...   
600  "It opened doors to higher education that were...   

             Predicted_Label  
0          Financial Support  
1    Educational Opportunity  
2          Financial Support  
3    Educational Opportunity  
4    Educational Opportunity  
..                       ...  
596  Educational Opportunity  
597  Educational Opportunity  
598   Program Implementa

In [None]:
# ... (previous code)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Test_Data.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data (inputs and masks only; no labels are passed)
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted_Label'] = label_encoder.inverse_transform(test_predictions)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/predicted.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')

# Assuming you have a CSV file with validated labels, load it.
# NOTE: placeholder path; point it at the real validated-labels file.
validated_labels_path = "/path/to/validated_labels.csv"
validated_df = pd.read_csv(validated_labels_path)

# Attach the validated label to each test response, matching on 'Responses'.
# Keep one validated label per response first: duplicate keys on the right
# side of a merge would otherwise multiply the test rows.
validated_unique = validated_df[['Responses', 'Validated_Label']].drop_duplicates(subset='Responses')
test_df = test_df.merge(validated_unique, on='Responses', how='left')

# A left merge leaves NaN where a response has no validated label; such rows
# cannot be scored (and would make label_encoder.transform fail), so evaluate
# only the rows that were matched.
scored_df = test_df.dropna(subset=['Validated_Label'])
unmatched = len(test_df) - len(scored_df)
if unmatched:
    print(f'{unmatched} responses have no validated label and are excluded from the metrics')

# Calculate and print evaluation metrics
true_labels = label_encoder.transform(scored_df['Validated_Label'])
predicted_labels = label_encoder.transform(scored_df['Predicted_Label'])

accuracy = accuracy_score(true_labels, predicted_labels)
# Store the report under its own name: assigning it to `classification_report`
# would shadow the imported sklearn function for every later cell.
test_classification_report = classification_report(
    true_labels,
    predicted_labels,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)

print(f'Test Accuracy: {accuracy}')
print('Test Classification Report:')
print(test_classification_report)


In [None]:
# Experiment 1 - BatchSize-16, Epoch-1, Learning Rate-1e-5
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by preprocess_text: the stopword list, the WordNet
# corpus for the lemmatizer, and the 'punkt' models for word_tokenize.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses dataset from Google Drive.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    and finally discard English stopwords and tokens shorter than three
    characters.

    Args:
        text: The raw response. Missing values (``None``/``NaN``, which is
            what pandas yields for empty CSV cells) produce an empty string;
            any other non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); without
    # this guard a single blank response aborts the whole .apply() call.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on each whitespace-separated word)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label string to an integer class id. The fitted encoder is kept
# because its classes_ supply the class names for the reports below.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable. NOTE(review): the split is not stratified by label.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained tokenizer that matches the 'bert-base-uncased' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=256):
    """Encode the 'Processed_Response' column of ``df`` into BERT inputs.

    Every text is encoded on its own with the notebook-level ``tokenizer``:
    special tokens are added and the sequence is truncated or padded to
    ``max_length`` tokens.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — two tensors obtained by
        concatenating the per-text encodings along dimension 0.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-row tensors into one tensor per field.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Encode both splits into fixed-length input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encodings above.
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())

# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# ... while validation batches keep the row order of val_df.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained BERT with a classification head sized to the number of labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# AdamW over all model parameters, plus a linear learning-rate schedule that
# spans every optimisation step of the run and has no warm-up steps.
epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# Training loop: one pass over train_dataloader per epoch, followed by an
# evaluation pass over val_dataloader.

# Full list of class ids, passed to classification_report so that it still
# works when a class is missing from a split's labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        # .cpu() is a no-op for CPU tensors and keeps the NumPy conversion
        # valid if the model and batches are ever moved to a GPU.
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays exact
        # when the last batch is smaller than batch_size, so the final value
        # agrees with the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the SUM of the per-batch losses, which grows with the
    # number of batches; report the mean so the figure is comparable with
    # the training loss and across runs that use a different batch size.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6605498790740967, Average Training Loss: 1.6605498790740967, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.7241383790969849, Average Training Loss: 1.6923441290855408, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.5882024765014648, Average Training Loss: 1.6576302448908489, Training Accuracy: 0.14583333333333334
Epoch 1/1, Batch Loss: 1.5249766111373901, Average Training Loss: 1.6244668364524841, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.5881320238113403, Average Training Loss: 1.6171998739242555, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.599433183670044, Average Training Loss: 1.6142387588818867, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6526724100112915, Average Training Loss: 1.6197292804718018, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.6471292972564697, Average Training Loss: 1.6231542825698853, Training Accuracy: 0.1640625
Epoch 1/1, Batch Loss: 1.5935280323028564, Average Training Loss: 1.61986247

In [None]:
# Experiment 2 - BatchSize-16, Epoch-1, Learning Rate-5e-5, MaxLength-128
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by preprocess_text: the stopword list, the WordNet
# corpus for the lemmatizer, and the 'punkt' models for word_tokenize.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses dataset from Google Drive.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    and finally discard English stopwords and tokens shorter than three
    characters.

    Args:
        text: The raw response. Missing values (``None``/``NaN``, which is
            what pandas yields for empty CSV cells) produce an empty string;
            any other non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); without
    # this guard a single blank response aborts the whole .apply() call.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on each whitespace-separated word)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label string to an integer class id. The fitted encoder is kept
# because its classes_ supply the class names for the reports below.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable. NOTE(review): the split is not stratified by label.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained tokenizer that matches the 'bert-base-uncased' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column of ``df`` into BERT inputs.

    Every text is encoded on its own with the notebook-level ``tokenizer``:
    special tokens are added and the sequence is truncated or padded to
    ``max_length`` tokens.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — two tensors obtained by
        concatenating the per-text encodings along dimension 0.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-row tensors into one tensor per field.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Encode both splits into fixed-length input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encodings above.
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())

# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# ... while validation batches keep the row order of val_df.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained BERT with a classification head sized to the number of labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# AdamW over all model parameters, plus a linear learning-rate schedule that
# spans every optimisation step of the run and has no warm-up steps.
epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# Training loop: one pass over train_dataloader per epoch, followed by an
# evaluation pass over val_dataloader.

# Full list of class ids, passed to classification_report so that it still
# works when a class is missing from a split's labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        # .cpu() is a no-op for CPU tensors and keeps the NumPy conversion
        # valid if the model and batches are ever moved to a GPU.
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays exact
        # when the last batch is smaller than batch_size, so the final value
        # agrees with the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the SUM of the per-batch losses, which grows with the
    # number of batches; report the mean so the figure is comparable with
    # the training loss and across runs that use a different batch size.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.642849326133728, Average Training Loss: 1.642849326133728, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6452349424362183, Average Training Loss: 1.6440421342849731, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6601519584655762, Average Training Loss: 1.6494120756785076, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.7175464630126953, Average Training Loss: 1.6664456725120544, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.5867669582366943, Average Training Loss: 1.6505099296569825, Training Accuracy: 0.1375
Epoch 1/1, Batch Loss: 1.5878355503082275, Average Training Loss: 1.6400641997655232, Training Accuracy: 0.16666666666666666
Epoch 1/1, Batch Loss: 1.5517346858978271, Average Training Loss: 1.6274456977844238, Training Accuracy: 0.20535714285714285
Epoch 1/1, Batch Loss: 1.5813757181167603, Average Training Loss: 1.6216869503259659, Training Accuracy: 0.2265625
Epoch 1/1, Batch Loss: 1.5881948471069336, Average Training Loss: 1.6179656

In [None]:
# Experiment 2 - BatchSize-64, Epoch-1, Learning Rate-5e-5, MaxLength-128
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by preprocess_text: the stopword list, the WordNet
# corpus for the lemmatizer, and the 'punkt' models for word_tokenize.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses dataset from Google Drive.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    and finally discard English stopwords and tokens shorter than three
    characters.

    Args:
        text: The raw response. Missing values (``None``/``NaN``, which is
            what pandas yields for empty CSV cells) produce an empty string;
            any other non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); without
    # this guard a single blank response aborts the whole .apply() call.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on each whitespace-separated word)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label string to an integer class id. The fitted encoder is kept
# because its classes_ supply the class names for the reports below.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable. NOTE(review): the split is not stratified by label.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained tokenizer that matches the 'bert-base-uncased' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the 'Processed_Response' column of ``df`` into BERT inputs.

    Every text is encoded on its own with the notebook-level ``tokenizer``:
    special tokens are added and the sequence is truncated or padded to
    ``max_length`` tokens.

    Args:
        df: DataFrame with a 'Processed_Response' column of strings.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` — two tensors obtained by
        concatenating the per-text encodings along dimension 0.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-row tensors into one tensor per field.
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    return input_ids, attention_masks

# Encode both splits into fixed-length input-id / attention-mask tensors.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Integer class ids as tensors, in the same row order as the encodings above.
train_labels = torch.tensor(train_df['Encoded_Label'].to_numpy())
val_labels = torch.tensor(val_df['Encoded_Label'].to_numpy())

# Mini-batch size shared by the training and validation loaders.
batch_size = 64

# Training batches are drawn in random order ...
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# ... while validation batches keep the row order of val_df.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Pretrained BERT with a classification head sized to the number of labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# AdamW over all model parameters, plus a linear learning-rate schedule that
# spans every optimisation step of the run and has no warm-up steps.
epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# Training loop: one pass over train_dataloader per epoch, followed by an
# evaluation pass over val_dataloader.

# Full list of class ids, passed to classification_report so that it still
# works when a class is missing from a split's labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        # .cpu() is a no-op for CPU tensors and keeps the NumPy conversion
        # valid if the model and batches are ever moved to a GPU.
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses. Dividing by the number of
        # batches seen (rather than total_samples / batch_size) stays exact
        # when the last batch is smaller than batch_size, so the final value
        # agrees with the epoch average computed below.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # val_loss is the SUM of the per-batch losses, which grows with the
    # number of batches; report the mean so the figure is comparable with
    # the training loss and across runs that use a different batch size.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.666519045829773, Average Training Loss: 1.666519045829773, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6296865940093994, Average Training Loss: 1.6481028199195862, Training Accuracy: 0.171875
Epoch 1/1, Batch Loss: 1.6017796993255615, Average Training Loss: 1.6326617797215779, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.5643727779388428, Average Training Loss: 1.6155895292758942, Training Accuracy: 0.22265625
Epoch 1/1, Batch Loss: 1.5373390913009644, Average Training Loss: 1.5999394416809083, Training Accuracy: 0.24375
Epoch 1/1, Batch Loss: 1.5280025005340576, Average Training Loss: 1.5879499514897664, Training Accuracy: 0.24739583333333334
Epoch 1/1, Batch Loss: 1.5029752254486084, Average Training Loss: 1.5758107049124581, Training Accuracy: 0.27232142857142855
Epoch 1/1, Batch Loss: 1.4641032218933105, Average Training Loss: 1.5618472695350647, Training Accuracy: 0.2890625
Epoch 1/1, Batch Loss: 1.4384347200393677, Average Training Loss: 1

In [None]:
# Experiment 2 - BatchSize-32, Epoch-1, Learning Rate-5e-5, MaxLength-128
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Fetch the NLTK data used by preprocess_text: the stopword list, the WordNet
# corpus for the lemmatizer, and the 'punkt' models for word_tokenize.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the labelled responses dataset from Google Drive.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop URLs, expand
    whitespace-separated words found in ``contractions_dict``, remove every
    character that is not an ASCII letter or whitespace, tokenize, lemmatize,
    and finally discard English stopwords and tokens shorter than three
    characters.

    Args:
        text: The raw response. Missing values (``None``/``NaN``, which is
            what pandas yields for empty CSV cells) produce an empty string;
            any other non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); without
    # this guard a single blank response aborts the whole .apply() call.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (exact match on each whitespace-separated word)
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

# Clean every raw response and keep the result next to the original text.
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].iloc[:10])

# Map each label string to an integer class id. The fitted encoder is kept
# because its classes_ also size the classification head of the model.
label_encoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = label_encoder.transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable. NOTE(review): the split is not stratified by label.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Load the pretrained tokenizer that matches the 'bert-base-uncased' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the notebook's tokenizer.

    Each response is encoded on its own with special tokens added, padded to
    ``max_length`` and truncated to ``max_length``; the per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame that has a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per response
        (presumably of shape ``(len(df), max_length)`` - confirm against the
        tokenizer's output).
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Encode both splits into padded BERT inputs.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

# Linear schedule over every optimizer step of the run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Every encoded class id. Passing it as `labels=` keeps classification_report at
# one row per target name even when a class is absent from both the true labels
# and the predictions of a split; without it scikit-learn raises ValueError
# because the number of observed classes no longer matches target_names.
class_ids = list(range(len(label_encoder.classes_)))

# NOTE: neither the model nor the batches are moved to a device in this cell,
# and predictions are converted with .numpy() directly.

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the number
        # of batches (rather than total_samples / batch_size) stays exact when the
        # final batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (predictions were collected in train mode).
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss is the sum of the per-batch losses; report the per-batch mean so the
    # figure is comparable with the average training loss and across batch sizes.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6754621267318726, Average Training Loss: 1.6754621267318726, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6339415311813354, Average Training Loss: 1.654701828956604, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6925803422927856, Average Training Loss: 1.6673280000686646, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6836494207382202, Average Training Loss: 1.6714083552360535, Training Accuracy: 0.1953125
Epoch 1/1, Batch Loss: 1.557828664779663, Average Training Loss: 1.6486924171447754, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.5971646308898926, Average Training Loss: 1.6401044527689617, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.5953556299209595, Average Training Loss: 1.6337117637906755, Training Accuracy: 0.21428571428571427
Epoch 1/1, Batch Loss: 1.5597234964370728, Average Training Loss: 1.6244632303714752, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6012053489685059, Average Training Loss: 1.621879021326701, Tra

In [None]:
# Experiment 2 - BatchSize-32, Epoch-1, Learning Rate-5e-5, MaxLength-256
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load the
# tokenizer behind nltk.word_tokenize from 'punkt_tab' instead of 'punkt'. On an
# older NLTK that does not know the package, nltk.download only logs an error and
# returns False, so the extra request is harmless there.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data (the path assumes Google Drive is already mounted under /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lower-case, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stopwords, and drop tokens
    of one or two characters.

    Args:
        text: A single (scalar) cell value from the ``Responses`` column.
            Missing values (``None``/``NaN``) yield an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be ``''``).
    """
    # A missing answer reaches us as NaN/None; calling .lower() on it would raise.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer, stopword set and contraction table are the same for every
    # row, so build them on the first call and reuse them afterwards instead of
    # re-reading the stopword list once per response.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            # Keys are lower-cased because the lookup happens after the text has
            # been lower-cased; a capitalised dictionary key would otherwise
            # never match. Values are lower-cased to keep the text uniform.
            {key.lower(): value.lower() for key, value in contractions_dict.items()},
        )
        preprocess_text._resources = resources
    lemmatizer, stop_words, contraction_map = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words only)
    text = ' '.join(contraction_map.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response and store the result in a new column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].head(10))

# Map each distinct value of 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit(df['Label']).transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=256):
    """Encode the ``Processed_Response`` column of ``df`` with the notebook's tokenizer.

    Each response is encoded on its own with special tokens added, padded to
    ``max_length`` and truncated to ``max_length``; the per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame that has a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per response
        (presumably of shape ``(len(df), max_length)`` - confirm against the
        tokenizer's output).
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Encode both splits into padded BERT inputs.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

# Linear schedule over every optimizer step of the run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Every encoded class id. Passing it as `labels=` keeps classification_report at
# one row per target name even when a class is absent from both the true labels
# and the predictions of a split; without it scikit-learn raises ValueError
# because the number of observed classes no longer matches target_names.
class_ids = list(range(len(label_encoder.classes_)))

# NOTE: neither the model nor the batches are moved to a device in this cell,
# and predictions are converted with .numpy() directly.

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the number
        # of batches (rather than total_samples / batch_size) stays exact when the
        # final batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (predictions were collected in train mode).
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss is the sum of the per-batch losses; report the per-batch mean so the
    # figure is comparable with the average training loss and across batch sizes.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.664396047592163, Average Training Loss: 1.664396047592163, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.668453335762024, Average Training Loss: 1.6664246916770935, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.5615029335021973, Average Training Loss: 1.6314507722854614, Training Accuracy: 0.2708333333333333
Epoch 1/1, Batch Loss: 1.5911309719085693, Average Training Loss: 1.6213708221912384, Training Accuracy: 0.265625
Epoch 1/1, Batch Loss: 1.6556731462478638, Average Training Loss: 1.6282312870025635, Training Accuracy: 0.23125
Epoch 1/1, Batch Loss: 1.6124444007873535, Average Training Loss: 1.6256001393000286, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.612802505493164, Average Training Loss: 1.6237719058990479, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.632321834564209, Average Training Loss: 1.624840646982193, Training Accuracy: 0.22265625
Epoch 1/1, Batch Loss: 1.6076148748397827, Average Training Loss: 1.6229266722997029, T

In [None]:
# Experiment 2 (TODO: renumber - this label is duplicated from an earlier cell) - BatchSize-16, Epoch-10, Learning Rate-5e-5
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load the
# tokenizer behind nltk.word_tokenize from 'punkt_tab' instead of 'punkt'. On an
# older NLTK that does not know the package, nltk.download only logs an error and
# returns False, so the extra request is harmless there.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data (the path assumes Google Drive is already mounted under /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lower-case, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stopwords, and drop tokens
    of one or two characters.

    Args:
        text: A single (scalar) cell value from the ``Responses`` column.
            Missing values (``None``/``NaN``) yield an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be ``''``).
    """
    # A missing answer reaches us as NaN/None; calling .lower() on it would raise.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer, stopword set and contraction table are the same for every
    # row, so build them on the first call and reuse them afterwards instead of
    # re-reading the stopword list once per response.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            # Keys are lower-cased because the lookup happens after the text has
            # been lower-cased; a capitalised dictionary key would otherwise
            # never match. Values are lower-cased to keep the text uniform.
            {key.lower(): value.lower() for key, value in contractions_dict.items()},
        )
        preprocess_text._resources = resources
    lemmatizer, stop_words, contraction_map = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words only)
    text = ' '.join(contraction_map.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response and store the result in a new column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].head(10))

# Map each distinct value of 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit(df['Label']).transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the notebook's tokenizer.

    Each response is encoded on its own with special tokens added, padded to
    ``max_length`` and truncated to ``max_length``; the per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame that has a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per response
        (presumably of shape ``(len(df), max_length)`` - confirm against the
        tokenizer's output).
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Encode both splits into padded BERT inputs.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs

# Linear schedule over every optimizer step of the run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Every encoded class id. Passing it as `labels=` keeps classification_report at
# one row per target name even when a class is absent from both the true labels
# and the predictions of a split; without it scikit-learn raises ValueError
# because the number of observed classes no longer matches target_names.
class_ids = list(range(len(label_encoder.classes_)))

# NOTE: neither the model nor the batches are moved to a device in this cell,
# and predictions are converted with .numpy() directly.

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the number
        # of batches (rather than total_samples / batch_size) stays exact when the
        # final batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (predictions were collected in train mode).
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss is the sum of the per-batch losses; report the per-batch mean so the
    # figure is comparable with the average training loss and across batch sizes.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Batch Loss: 1.6228388547897339, Average Training Loss: 1.6228388547897339, Training Accuracy: 0.125
Epoch 1/10, Batch Loss: 1.74497389793396, Average Training Loss: 1.683906376361847, Training Accuracy: 0.125
Epoch 1/10, Batch Loss: 1.5916099548339844, Average Training Loss: 1.653140902519226, Training Accuracy: 0.14583333333333334
Epoch 1/10, Batch Loss: 1.624433159828186, Average Training Loss: 1.645963966846466, Training Accuracy: 0.15625
Epoch 1/10, Batch Loss: 1.578614592552185, Average Training Loss: 1.6324940919876099, Training Accuracy: 0.175
Epoch 1/10, Batch Loss: 1.6372566223144531, Average Training Loss: 1.6332878470420837, Training Accuracy: 0.1875
Epoch 1/10, Batch Loss: 1.6162292957305908, Average Training Loss: 1.630850911140442, Training Accuracy: 0.19642857142857142
Epoch 1/10, Batch Loss: 1.6270800828933716, Average Training Loss: 1.630379557609558, Training Accuracy: 0.1796875
Epoch 1/10, Batch Loss: 1.5234469175338745, Average Training Loss: 1.618498153

In [None]:
# Experiment 2 (TODO: renumber - this label is duplicated from an earlier cell) - BatchSize-16, Epoch-5, Learning Rate-2e-5
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load the
# tokenizer behind nltk.word_tokenize from 'punkt_tab' instead of 'punkt'. On an
# older NLTK that does not know the package, nltk.download only logs an error and
# returns False, so the extra request is harmless there.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data (the path assumes Google Drive is already mounted under /content/drive)
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    """Normalise one raw response into a space-joined string of cleaned tokens.

    Steps, in order: lower-case, strip HTML markup, drop URLs, expand
    contractions, remove every character that is not an ASCII letter or
    whitespace, tokenize, lemmatize, drop English stopwords, and drop tokens
    of one or two characters.

    Args:
        text: A single (scalar) cell value from the ``Responses`` column.
            Missing values (``None``/``NaN``) yield an empty string; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces (may be ``''``).
    """
    # A missing answer reaches us as NaN/None; calling .lower() on it would raise.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # The lemmatizer, stopword set and contraction table are the same for every
    # row, so build them on the first call and reuse them afterwards instead of
    # re-reading the stopword list once per response.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            # Keys are lower-cased because the lookup happens after the text has
            # been lower-cased; a capitalised dictionary key would otherwise
            # never match. Values are lower-cased to keep the text uniform.
            {key.lower(): value.lower() for key, value in contractions_dict.items()},
        )
        preprocess_text._resources = resources
    lemmatizer, stop_words, contraction_map = resources

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (whitespace-delimited words only)
    text = ' '.join(contraction_map.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

    # Remove stopwords and short words (length <= 2)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

# Run the cleaning pipeline over every raw response and store the result in a new column.
df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Preview the first ten cleaned responses as a sanity check.
print(df['Processed_Response'].head(10))

# Map each distinct value of 'Label' to an integer id.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit(df['Label']).transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` column of ``df`` with the notebook's tokenizer.

    Each response is encoded on its own with special tokens added, padded to
    ``max_length`` and truncated to ``max_length``; the per-response tensors
    are then concatenated along dimension 0.

    Args:
        df: DataFrame that has a ``Processed_Response`` column of strings.
        max_length: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per response
        (presumably of shape ``(len(df), max_length)`` - confirm against the
        tokenizer's output).
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    id_rows = [encoding['input_ids'] for encoding in encodings]
    mask_rows = [encoding['attention_mask'] for encoding in encodings]

    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

# Encode both splits into padded BERT inputs.
train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune BERT model: one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

# Linear schedule over every optimizer step of the run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Every encoded class id. Passing it as `labels=` keeps classification_report at
# one row per target name even when a class is absent from both the true labels
# and the predictions of a split; without it scikit-learn raises ValueError
# because the number of observed classes no longer matches target_names.
class_ids = list(range(len(label_encoder.classes_)))

# NOTE: neither the model nor the batches are moved to a device in this cell,
# and predictions are converted with .numpy() directly.

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running mean of the per-batch losses seen so far. Dividing by the number
        # of batches (rather than total_samples / batch_size) stays exact when the
        # final batch holds fewer than batch_size samples.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report (predictions were collected in train mode).
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # val_loss is the sum of the per-batch losses; report the per-batch mean so the
    # figure is comparable with the average training loss and across batch sizes.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()


0    focused studying much think could pay tuition fee
1            also helped become responsible many thing
2    every student help make thier dream become pos...
3           became serious studying fail graduate time
4    experience free funded material module make st...
5      program made possible continue studying college
6      scholarship serve stepping stone onward success
7    help finish study without worrying tuition als...
8    need worry financial expense allowance tuition...
9    one beneficiary made lot enthusiastic came cla...
Name: Processed_Response, dtype: object


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6391780376434326, Average Training Loss: 1.6391780376434326, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.7085189819335938, Average Training Loss: 1.6738485097885132, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.610846996307373, Average Training Loss: 1.6528480052947998, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.6213816404342651, Average Training Loss: 1.6449814140796661, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.5468406677246094, Average Training Loss: 1.6253532648086548, Training Accuracy: 0.2125
Epoch 1/5, Batch Loss: 1.5807524919509888, Average Training Loss: 1.6179198026657104, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.5779796838760376, Average Training Loss: 1.6122140714100428, Training Accuracy: 0.22321428571428573
Epoch 1/5, Batch Loss: 1.5936646461486816, Average Training Loss: 1.6098953932523727, Training Accuracy: 0.2421875
Epoch 1/5, Batch Loss: 1.6868716478347778, Average Training Loss: 1.6184483

In [None]:
# Experiment 2 - BatchSize-16, Epoch-1, Learning Rate-1e-5
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup  # For removing HTML tags
from contractions import contractions_dict  # You may need to install the contractions library

# Download NLTK resources
# Newer NLTK releases make nltk.word_tokenize load the 'punkt_tab' package
# instead of the pickled 'punkt' models, so both are requested here. With its
# default arguments nltk.download() prints an error for an id the installed
# index does not know rather than raising, so the extra name is harmless on
# older releases.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data
# NOTE(review): the path points into Google Drive, so this presumably needs
# Drive mounted at /content/drive before the cell runs - confirm.
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Normalise one raw response into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip HTML markup, drop ``http...`` URLs,
    expand contractions word by word, delete every character that is not
    ``a-z``/``A-Z`` or whitespace, tokenize, drop English stop words,
    lemmatize, and finally drop tokens of two characters or fewer.

    Args:
        text: The raw response. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty; any other non-string value
            is converted with ``str()`` first.

    Returns:
        str: The surviving lemmas joined by single spaces; ``''`` when the
        input is missing or nothing survives the filters.
    """
    # Missing cells have no .lower(); map them to an empty document instead of
    # letting Series.apply abort on the first blank row.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions (looked up per whitespace-separated word)
    text = ' '.join(contractions_dict.get(word, word) for word in text.split())

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The lemmatizer and stop-word set are identical for every row, so they are
    # built once and reused instead of being recreated on each call.
    lemmatizer, stop_words = _preprocess_resources()

    tokens = []
    for token in nltk.word_tokenize(text):
        # Filter stop words on the surface form first: the noun lemmatizer
        # mangles some of them (e.g. 'does' -> 'doe'), which would otherwise
        # let them slip past the stop-word check.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also drop lemmas that are stop words themselves, and short words.
        if lemma in stop_words or len(lemma) <= 2:
            continue
        tokens.append(lemma)

    # Join tokens back into a string
    return ' '.join(tokens)

# Clean every raw response and store the result in its own column
df['Processed_Response'] = df['Responses'].map(preprocess_text)

# Show the first 10 cleaned responses as a quick sanity check
print(df['Processed_Response'].iloc[:10])

# Turn each distinct label into an integer id
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Load the pretrained uncased BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def tokenize_text(df, max_length=128):
    """Encode the ``Processed_Response`` texts of ``df`` with the module-level ``tokenizer``.

    Every text is encoded separately with special tokens added and with
    padding/truncation to ``max_length``; the per-text tensors are then
    concatenated along dimension 0.

    Args:
        df: Object whose ``'Processed_Response'`` entry iterates over the texts.
        max_length: Length each encoded sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)`` tensors, one row per text.
    """
    encodings = [
        tokenizer.encode_plus(
            response,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        for response in df['Processed_Response']
    ]

    # Stack the single-row tensors into one batch tensor per field
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

# Mini-batch size shared by the training and validation loaders
batch_size = 16

# Training split: token ids, attention masks and integer labels, served in random order
train_inputs, train_masks = tokenize_text(train_df)
train_labels = torch.tensor(train_df['Encoded_Label'].values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same tensors, served in their original order
val_inputs, val_masks = tokenize_text(val_df)
val_labels = torch.tensor(val_df['Encoded_Label'].values)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Fine-tune BERT model
# Train on the GPU when one is available. The model is moved before the
# optimizer is built so the optimizer is created over the parameters in their
# final location.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
model.to(device)

# Add optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# reproduces the deprecated class's default (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Passing the full list of label ids keeps classification_report aligned with
# target_names even when a class is missing from a split and never predicted;
# without it sklearn raises on the size mismatch.
report_labels = list(range(num_classes))

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for step, batch in enumerate(train_dataloader, start=1):
        # Batches come off the DataLoader on the CPU; move them next to the model
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Bring results back to the CPU before converting to NumPy
        logits = outputs.logits
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels_np = labels.cpu().numpy()

        correct_predictions += np.sum(preds == labels_np)
        total_samples += len(labels_np)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels_np.tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch.
        # total_loss is a sum of per-batch means, so it is divided by the number
        # of batches seen; total_samples / batch_size is off once a short final
        # batch has been processed.
        avg_train_loss = total_loss / step
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(
        all_labels_train,
        all_preds_train,
        labels=report_labels,
        target_names=label_encoder.classes_,
    )
    print('Training Classification Report:')
    print(train_classification_report)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Report the mean per-batch loss so it is comparable with the training loss
    # and does not scale with the number of validation batches.
    avg_val_loss = val_loss / max(len(val_dataloader), 1)

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(
        true_labels,
        predictions,
        labels=report_labels,
        target_names=label_encoder.classes_,
    )

    print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)
