# Intent Classification of Novo Trustpilot Reviews

In [2]:
import pandas as pd

# Load the dataset
# NOTE: despite the .xls extension, the file is read as CSV text with read_csv.
file_path = "/content/[Data] Novo Reviews - Novo's Trust Pilot Ratings.xls"

try:
    df = pd.read_csv(file_path)
except pd.errors.ParserError as e:
    # Report the parsing problem, then stop: every later cell needs `df`.
    print(f"ParserError: {e}")
    raise
except Exception as e:
    # Same for any other failure (e.g. a missing file). Swallowing it here would only
    # postpone the crash to an unrelated NameError on `df` in the next cell.
    print(f"Error: {e}")
    raise
else:
    # Display the first few rows of the dataset
    print(df.head())


                   rating                 rating title  \
0  Rated 5 out of 5 stars    Great bank for businesses   
1  Rated 3 out of 5 stars  Buggy when it matters most.   
2  Rated 5 out of 5 stars         Love digital banking   
3  Rated 3 out of 5 stars    Poor support and no wires   
4  Rated 4 out of 5 stars        Really easy to set up   

                                         Review text   Review date  \
0  It was easy to set up, with no hassle like som...  Feb 15, 2024   
1  Unfortunately I’m probably going to figure out...  Feb 19, 2024   
2  Love digital banking I keep now all my busines...   Feb 1, 2024   
3  A decent basic free business bank, but have re...   Feb 8, 2024   
4  Really easy to set up. Works as intended most ...  Jan 10, 2024   

  Date of Experience  rating_procesed  Year of review   Year of experience  \
0  December 01, 2023                5             2024                2023   
1  February 18, 2024                3             2024                2024

In [3]:
# One seed drives both the placeholder labels and the train/test split, so a
# Restart & Run All reproduces the same data and the same metrics.
SEED = 42

# Combining 'rating title' and 'review text' into a single column for text analysis
df['text'] = df['rating title'].astype(str) + " " + df['Review text'].astype(str)

# No hand-labelled 'intent' column exists yet, so the labels below are random
# placeholders drawn from the taxonomy. A model trained on them can only reach
# chance-level accuracy; replace this step with the actual labels.
import random
taxonomy = [
    "Account->Lost password",
    "Checks->Mobile deposits->Void checks",
    "Debit card->Declined",
    "Debit card->Unauthorized transactions->fraud",
    "Invoices->sent->unpaid->conflict",
    "Invoices->sent->paid",
    "Invoices->sent->unpaid->pending"
]

# A dedicated seeded generator keeps the placeholder labels identical across re-runs
# without touching the global `random` state.
label_rng = random.Random(SEED)
df['intent'] = [label_rng.choice(taxonomy) for _ in range(len(df))]

# Drop rows with missing intents if any (rebinding instead of mutating in place)
df = df.dropna(subset=['intent'])

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['intent'], test_size=0.2, random_state=SEED
)



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Baseline text classifier: TF-IDF features (English stop words removed)
# feeding a logistic regression that is allowed up to 1000 solver iterations.
vectorizer = TfidfVectorizer(stop_words='english')
classifier = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(vectorizer, classifier)

# Fit on the training split
pipeline.fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                                              precision    recall  f1-score   support

                      Account->Lost password       0.16      0.16      0.16        91
        Checks->Mobile deposits->Void checks       0.11      0.14      0.12        96
                        Debit card->Declined       0.12      0.14      0.13       107
Debit card->Unauthorized transactions->fraud       0.16      0.15      0.15       121
                        Invoices->sent->paid       0.16      0.11      0.13        95
            Invoices->sent->unpaid->conflict       0.10      0.09      0.10        97
             Invoices->sent->unpaid->pending       0.12      0.12      0.12        88

                                    accuracy                           0.13       695
                                   macro avg       0.13      0.13      0.13       695
                                weighted avg       0.13      0.13      0.13       695



In [5]:
# Install Hugging Face transformers with its `torch` extra; the version is not pinned.
# The log below shows this pulling in accelerate>=0.21.0 and the CUDA 12.1 wheels.
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [6]:
# Upgrade accelerate to the newest available release (-U); the version is not pinned.
!pip install accelerate -U



In [7]:
# Upgrade torch to the newest available release (-U); the version is not pinned.
# NOTE(review): the log below shows this replacing the preinstalled torch 2.3.0+cu121
# with 2.3.1 and pip reporting a dependency-resolver error; the following cell then
# pins torch back to 2.3.0. This cell looks removable — confirm before deleting.
!pip install torch -U

Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.3.1 (from torch)
  Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.3.0
    Uninstalling triton-2.3.0:
      Successfully uninstalled triton-2.3.0
  Attempting uninstall: torch
    Found existing installation: torch 2.3.0+cu121
    Uninstalling torch-2.3.0+cu121:
      Successfully uninstalled torch-2.3.0+cu121
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency c

In [8]:
# Pin torch back to 2.3.0 (the log below shows torch 2.3.1 and triton 2.3.1 being
# uninstalled), then request torchaudio 2.3.0+cu121 and torchvision 0.18.0+cu121.
!pip install torch==2.3.0
!pip install torchaudio==2.3.0+cu121 torchvision==0.18.0+cu121


Collecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.3.0 (from torch==2.3.0)
  Downloading triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.3.1
    Uninstalling triton-2.3.1:
      Successfully uninstalled triton-2.3.1
  Attempting uninstall: torch
    Found existing installation: torch 2.3.1
    Uninstalling torch-2.3.1:
      Successfully uninstalled torch-2.3.1
Successfully installed torch-2.3.0 triton-2.3.0


In [18]:
class FeedbackDataset(Dataset):
    """Torch dataset that tokenizes review text and pairs it with an integer class id.

    Args:
        reviews: Indexable sequence of review texts; each item is passed through ``str()``.
        labels: Indexable sequence of labels aligned with ``reviews``. Values found in
            ``label_mapping`` are translated through it; all other values must be
            convertible with ``int()``.
        tokenizer: Object exposing ``encode_plus``; it is called with the keyword
            arguments shown in ``__getitem__``.
        max_len: Length passed to the tokenizer as ``max_length``.
        label_mapping: Optional ``{label: class_id}`` dict for non-numeric labels such
            as intent strings. Omit it when ``labels`` already holds integers.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, label_mapping=None):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_mapping = label_mapping

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded review at position ``item`` with its class id.

        Raises:
            ValueError: If the label is neither in ``label_mapping`` nor an integer value.
        """
        review = str(self.reviews[item])

        raw_label = self.labels[item]
        if self.label_mapping is not None and raw_label in self.label_mapping:
            label = self.label_mapping[raw_label]
        else:
            try:
                label = int(raw_label)  # Convert label to integer
            except ValueError:
                # A bare int() failure on an intent string gives no hint about the
                # remedy, so say what is missing.
                raise ValueError(
                    f"Label {raw_label!r} is not an integer and has no entry in label_mapping"
                ) from None

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # return_tensors='pt' yields tensors with a leading batch axis; flatten() drops it.
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [24]:
class FeedbackDataset(Dataset):
    """Torch dataset that tokenizes review text and pairs it with an integer class id.

    Args:
        reviews: Indexable sequence of review texts; each item is passed through ``str()``.
        labels: Indexable sequence of labels aligned with ``reviews``. ``int`` labels are
            used as-is; anything else is looked up in ``label_mapping``.
        tokenizer: Object exposing ``encode_plus``; it is called with the keyword
            arguments shown in ``__getitem__``.
        max_len: Length passed to the tokenizer as ``max_length``.
        label_mapping: Optional ``{label: class_id}`` dict. When omitted, the mapping is
            built from the notebook-level ``df['intent']`` column, exactly as before.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, label_mapping=None):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_mapping is None:
            # Backward-compatible default: enumerate the distinct intents of the
            # notebook-level frame so existing four-argument call sites keep working.
            label_mapping = {label: idx for idx, label in enumerate(df['intent'].unique())}
        self.label_mapping = label_mapping

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded review at position ``item`` with its class id.

        Raises:
            ValueError: If a non-integer label has no entry in ``label_mapping``.
        """
        review = str(self.reviews[item])
        # Check if the label is already an integer, if not, map it
        label = self.labels[item]
        if not isinstance(label, int):
            if label not in self.label_mapping:
                # Fail loudly: substituting -1 would hand the model a class index
                # outside the 0..num_labels-1 range.
                raise ValueError(f"Unknown label {label!r}: not present in label_mapping")
            label = self.label_mapping[label]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # return_tensors='pt' yields tensors with a leading batch axis; flatten() drops it.
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

class FeedbackDataset(Dataset):
    """Torch dataset that tokenizes review text and pairs it with an integer class id.

    Args:
        reviews: Indexable sequence of review texts; each item is passed through ``str()``.
        labels: Indexable sequence of labels aligned with ``reviews``. ``int`` labels are
            used as-is; anything else is looked up in ``label_mapping``.
        tokenizer: Object exposing ``encode_plus``; it is called with the keyword
            arguments shown in ``__getitem__``.
        max_len: Length passed to the tokenizer as ``max_length``.
        label_mapping: Optional ``{label: class_id}`` dict. When omitted, the mapping is
            built from the notebook-level ``df['intent']`` column, exactly as before.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, label_mapping=None):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_mapping is None:
            # Backward-compatible default: enumerate the distinct intents of the
            # notebook-level frame so existing four-argument call sites keep working.
            label_mapping = {label: idx for idx, label in enumerate(df['intent'].unique())}
        self.label_mapping = label_mapping

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded review at position ``item`` with its class id.

        Raises:
            ValueError: If a non-integer label has no entry in ``label_mapping``.
        """
        review = str(self.reviews[item])
        # Check if the label is already an integer, if not, map it
        label = self.labels[item]
        if not isinstance(label, int):
            if label not in self.label_mapping:
                # Fail loudly: substituting -1 would hand the model a class index
                # outside the 0..num_labels-1 range.
                raise ValueError(f"Unknown label {label!r}: not present in label_mapping")
            label = self.label_mapping[label]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # return_tensors='pt' yields tensors with a leading batch axis; flatten() drops it.
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load BERT tokenizer and model
# The classification head gets one output per distinct intent label.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['intent'].unique()))

# Prepare the dataset
# Plain lists give the dataset positional indexing regardless of the frame's index.
train_dataset = FeedbackDataset(X_train.to_list(), y_train.to_list(), tokenizer, max_len=160)
test_dataset = FeedbackDataset(X_test.to_list(), y_test.to_list(), tokenizer, max_len=160)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Warm up over the first 10% of optimizer steps. The previous fixed
    # warmup_steps=500 exceeded the roughly 350 steps in this single epoch
    # (about 2,780 training rows at batch size 8 on one device), so the learning
    # rate was still ramping up when training ended.
    warmup_ratio=0.1,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model (last expression, so the metrics dict is shown as the cell output)
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,2.0026


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing visible in this notebook reads from or writes to the mount,
# and it runs after training — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')