In [None]:
from Code.build_prompt import build_examples_prompt
from Code.utils import read_list_from_file, df_from_file, multiple_df
from Code.pretrained_model import load_model_and_tokenizer, predict_with_loaded_model, predict_column, calculate_accuracy

### Import Libraries

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import pandas as pd
from transformers import TFAutoModel, BertTokenizer

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

### Pre-trained Main

In [None]:
# Run the pre-trained classifier over the merged, labeled file and report accuracy.
df = pd.read_csv("./Data/Merged_file.csv")

# Integer id -> category label.
category_mapping = {0: 'AC', 1: "PC", 2: "TC", 3: "NC"}
# Reverse lookup (label -> integer id), used to encode the labeled columns.
label_to_id = {label: idx for idx, label in category_mapping.items()}

# Encode each labeled column as integers. Values not present in the mapping
# (including blanks) become NaN after .map and are replaced by -1 so the
# column can be cast to int.
for labeled_col, truth_col in (
    ('concerns category', 'ground_truth_concerns'),
    ('anything else category', 'ground_truth_anything_else'),
):
    df[truth_col] = df[labeled_col].map(label_to_id).fillna(-1).astype(int)

# Predict categories for each free-text column; the result is assigned back to df.
for text_col in ("concerns", "anything else"):
    df = predict_column(df, category_mapping, text_col)

# Compare the predictions against the encoded ground-truth columns.
accuracy_concerns = calculate_accuracy(df, "concerns", "ground_truth_concerns")
accuracy_anything_else = calculate_accuracy(df, "anything else", "ground_truth_anything_else")

print(f"Accuracy for 'concerns': {accuracy_concerns}")
print(f"Accuracy for 'anything else': {accuracy_anything_else}")

# Persist the frame (inputs, ground truth and whatever predict_column added) to Excel.
path = 'result_testDataLabeled.xlsx'
df.to_excel(path, index=False)

### Main

In [None]:
# Build one frame from the per-category text files (AC / PC / TC / NC).
ret = multiple_df(["AC.txt", "PC.txt", "TC.txt", "NC.txt"])

# Shuffle the rows with a fixed seed so that a Restart & Run All reproduces the
# same row order (and therefore the same train/test split further down).
shuffled_df = ret.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview only the first rows instead of dumping the whole frame.
shuffled_df.head()

In [None]:
# Work on the shuffled frame produced in the previous cell.
df = shuffled_df

# BERT tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Disabled: earlier encoding attempt with the 512-token maximum length.
#encoded_data = tokenizer(df['response'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
#labels = torch.tensor(df['category'].tolist())

### Train the model

In [None]:
# Fine-tune 'bert-base-uncased' as a sequence classifier on the shuffled
# responses, then report accuracy on a held-out 20% split.
df = shuffled_df

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text and encode labels. padding=True pads the responses to a common
# length, so the attention mask has to travel with the ids; otherwise the
# model attends to [PAD] tokens.
encoded_data = tokenizer(df['response'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')  # Reduced max_length for speed
labels = torch.tensor(df['category_int'].tolist())

# Train-Test Split: ids, masks and labels are split together so rows stay aligned.
train_texts, test_texts, train_masks, test_masks, train_labels, test_labels = train_test_split(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    labels,
    test_size=0.2,
    random_state=42)

# Create DataLoader for training
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)

# Create DataLoader for testing
test_dataset = TensorDataset(test_texts, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)

# Load pre-trained BERT model; the classification head is sized from the
# number of distinct label ids present in the data.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['category_int'].unique()))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer.
# NOTE: the loop below uses the loss returned by the model (outputs.loss);
# `criterion` is not used in this cell and is kept only so the name stays defined.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_loss = 0
    for batch in tqdm(train_dataloader):
        # Batch-local names: do not rebind the global `labels` tensor.
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        # Forward pass (passing labels makes the model return the loss)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    print(f"Average Epoch Loss: {avg_epoch_loss:.4f}")

# Evaluation loop on the held-out split
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted = outputs.logits.argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

### Save the trained model

In [None]:
# Save the fine-tuned weights (state dict only) to a file
torch.save(model.state_dict(), 'bert_model.pth')

# Rebuild the architecture with the same number of labels the trained model
# was created with; a hard-coded count would make load_state_dict fail with a
# shape mismatch whenever the data has a different number of classes.
loaded_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=model.config.num_labels)

# map_location='cpu' lets a checkpoint written from a GPU model be restored on
# a CPU-only machine; loaded_model itself is still on the CPU at this point.
loaded_model.load_state_dict(torch.load('bert_model.pth', map_location='cpu'))

# Set the model to evaluation mode (trailing ';' hides the long module repr)
loaded_model.eval();

### Evaluation of the trained model

In [None]:
# Classify a single example sentence with the fine-tuned model.
text = "I'm concerned about not remembering stuff from calc"

# Tokenize and encode the text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# The tokenizer returns CPU tensors, while the model may have been moved to
# the GPU during training; put the inputs on the model's device to avoid a
# device-mismatch error.
model_device = next(model.parameters()).device
inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

# Forward pass through the model in eval mode (dropout off), without gradients
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Convert logits to class probabilities
probs = softmax(outputs.logits, dim=1)

# Get the predicted class (index of the highest probability)
predicted_class = torch.argmax(probs, dim=1).item()

# Print the results
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probs.tolist()}")