In [None]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Disabled draft: the class below is wrapped in a string literal, so it is never defined or used.
# NOTE(review): `Dataset` in this notebook is imported from `datasets`; confirm that is the
# intended base class before re-enabling this code.
"""class Data(Dataset):
  def __init__(self,data_path:str = "df_file.csv"):
    super().__init__()

    self.data = pd.read_csv(data_path)
    self.x = self.data["Text"]
    self.y = self.data["Label"]

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index):
    return {"Text":self.x[index],"Label":self.y[index]}"""

In [None]:
# Read the CSV into pandas, then wrap the frame as a Hugging Face `Dataset`.
df = pd.read_csv("df_file.csv")
raw_data = Dataset.from_pandas(df)

In [None]:
# List the distinct values present in the "Label" column.
df["Label"].unique()

array([0, 1, 2, 3, 4])

In [None]:
# Pull the first record out of the dataset to inspect its fields.
sample_text = next(iter(raw_data))
sample_text

{'Text': 'Budget to set scene for election\n \n Gordon Brown will seek to put the economy at the centre of Labour\'s bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from Â£60,000. But the Conservatives and Lib Dems insist voters face higher taxes and more means-testing under Labour.\n \n Treasury officials have said there will not be a pre-election giveaway, but Mr Brown is thought to have about Â£2bn to spare.\n \n - Increase in the stamp duty threshold from Â£60,000 \n  - A freeze on petrol duty \n  - An extension of tax credit scheme for poorer families \n  - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\' general election manifestos. Ten years ago, buyers had

In [None]:
from transformers import AutoTokenizer,DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
  """Tokenize the "Text" field of a batch with truncation enabled."""
  texts = example["Text"]
  return tokenizer(texts, truncation=True)


# Tokenize in batches, then expose the target column under the name "labels".
tokenized_dataset = (
    raw_data
    .map(tokenize_function, batched=True)
    .rename_column("Label", "labels")
)

# Padding is left to the collator, which returns PyTorch tensors.
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)

Map:   0%|          | 0/2225 [00:00<?, ? examples/s]

In [None]:
try:
  import evaluate
except:
  !pip install evaluate
  import evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# Load the metric once at cell execution instead of on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: a (logits, labels) pair as handed over by the Trainer.

  Returns:
    The dict produced by the accuracy metric's `compute` call.
  """
  logits,labels = eval_pred
  # The predicted class is the index of the largest logit in each row.
  pred = np.argmax(logits,axis=-1)
  return accuracy_metric.compute(predictions=pred,references=labels)

In [None]:
from transformers import AutoModelForSequenceClassification
# num_labels=5 matches the five distinct values shown by df["Label"].unique() earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Total number of tokenized examples before splitting.
len(tokenized_dataset)

2225

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
data_split = tokenized_dataset.train_test_split(test_size=0.2,seed=42)
len(data_split["train"]),len(data_split["test"])

(1780, 445)

In [None]:
# Name the two halves of the split for the Trainer cell.
train = data_split["train"]
val = data_split["test"]

In [None]:
from transformers import TrainingArguments,Trainer

# Evaluate at the end of every epoch; checkpoints and logs go under "test-trainer".
training_args = TrainingArguments(
    output_dir="test-trainer",
    eval_strategy="epoch",
    weight_decay=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configured Trainer; evaluation is performed once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.229652,0.966292
2,No log,0.061561,0.991011
3,0.066200,0.085117,0.986517


Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=669, training_loss=0.05206210125009397, metrics={'train_runtime': 601.6745, 'train_samples_per_second': 8.875, 'train_steps_per_second': 1.112, 'total_flos': 1400906770013952.0, 'train_loss': 0.05206210125009397, 'epoch': 3.0})

In [None]:
import pandas as pd
# Load the app-review CSV. This rebinds `df`, which previously held the df_file.csv frame.
df = pd.read_csv("multilingual_mobile_app_reviews_2025.csv")
df.head()

Unnamed: 0,review_id,user_id,app_name,app_category,review_text,review_language,rating,review_date,verified_purchase,device_type,num_helpful_votes,user_age,user_country,user_gender,app_version
0,1,1967825,MX Player,Travel & Local,Qui doloribus consequuntur. Perspiciatis tempo...,no,1.3,2024-10-09 19:26:40,True,Android Tablet,65,14.0,China,Female,1.4
1,2,9242600,Tinder,Navigation,"Great app but too many ads, consider premium v...",ru,1.6,2024-06-21 17:29:40,True,iPad,209,18.0,Germany,Male,8.9
2,3,7636477,Netflix,Dating,The interface could be better but overall good...,es,3.6,2024-10-31 13:47:12,True,iPad,163,67.0,Nigeria,Male,2.8.37.5926
3,4,209031,Venmo,Productivity,"Latest update broke some features, please fix ...",vi,3.8,2025-03-12 06:16:22,True,iOS,664,66.0,India,Female,10.2
4,5,7190293,Google Drive,Education,"Perfect for daily use, highly recommend to eve...",tl,3.2,2024-04-21 03:48:27,True,iPad,1197,40.0,South Korea,Prefer not to say,4.7


In [None]:
# Show the column names of the review frame.
df.columns

Index(['review_id', 'user_id', 'app_name', 'app_category', 'review_text',
       'review_language', 'rating', 'review_date', 'verified_purchase',
       'device_type', 'num_helpful_votes', 'user_age', 'user_country',
       'user_gender', 'app_version'],
      dtype='object')

In [None]:
# Per-category row counts for the column later used as the classification target.
df["app_category"].value_counts()

Unnamed: 0_level_0,count
app_category,Unnamed: 1_level_1
Entertainment,167
Navigation,161
Travel & Local,159
Health & Fitness,155
Music & Audio,152
Business,150
Productivity,140
Dating,140
Video Players & Editors,139
Social Networking,139


In [None]:
from datasets import Dataset
# Rebinds `raw_data` to a dataset built from the app-review frame.
raw_data = Dataset.from_pandas(df)

In [None]:
# Columns carried over into the dataset.
raw_data.column_names

['review_id',
 'user_id',
 'app_name',
 'app_category',
 'review_text',
 'review_language',
 'rating',
 'review_date',
 'verified_purchase',
 'device_type',
 'num_helpful_votes',
 'user_age',
 'user_country',
 'user_gender',
 'app_version']

In [None]:
from transformers import AutoTokenizer,DataCollatorWithPadding
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Column holding the class to predict.
label_column = "app_category"

column_names = raw_data.column_names
# The target column must never be part of the model input. Joining *all* columns
# (as this cell did before) writes the answer into the text the model reads, which
# makes the task trivial and the reported accuracy/F1 meaningless.
feature_columns = [col for col in column_names if col != label_column]

def tokenize_function(example):
  """Tokenize a batch after joining every non-label column into one string per row.

  Args:
    example: a batch mapping column name -> list of values (used with batched=True).

  Returns:
    The tokenizer output for the joined strings, truncated but not padded.
  """
  combined_texts = [" ".join(str(value) for value in row)
                    for row in zip(*(example[col] for col in feature_columns))]
  # No padding here: the DataCollatorWithPadding used for training pads each batch,
  # so padding every row to the maximum length would only waste compute.
  return tokenizer(combined_texts,truncation=True)

tokenized_dataset = raw_data.map(tokenize_function,batched=True)
tokenized_dataset = tokenized_dataset.rename_column(label_column,"labels")

# Sorting the label names gives a deterministic name <-> id mapping.
label_list = sorted(set(tokenized_dataset["labels"]))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

def encode_labels(example):
  """Replace the category name stored in "labels" with its integer id."""
  return {"labels": label_to_id[example["labels"]]}

tokenized_dataset = tokenized_dataset.map(encode_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2514 [00:00<?, ? examples/s]

Map:   0%|          | 0/2514 [00:00<?, ? examples/s]

In [None]:
# Seeded 80/20 split of the tokenized reviews.
data_split = tokenized_dataset.train_test_split(test_size=0.2,seed=42)
train_dataset = data_split["train"]
val_dataset = data_split["test"]

# Collator that pads the examples of a batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sizes of the training and validation splits.
len(train_dataset),len(val_dataset)

(2011, 503)

In [None]:
# Spot-check one training label; after encode_labels it should be an integer id.
train_dataset["labels"][1]

3

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
try:
  import evaluate
except:
  !pip install evaluate
  import evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
import numpy as np
import evaluate

# Load both metrics once at cell execution instead of on every evaluation pass.
metrics_accuracy = evaluate.load("accuracy")
metrics_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
  """Compute accuracy and weighted F1 for one evaluation pass.

  Args:
    eval_pred: a (logits, labels) pair as handed over by the Trainer.

  Returns:
    A single dict merging the outputs of the accuracy and F1 metrics.
  """
  logits, labels = eval_pred
  # The predicted class is the index of the largest logit in each row.
  pred = np.argmax(logits, axis=-1)
  accuracy = metrics_accuracy.compute(predictions=pred, references=labels)
  f1 = metrics_f1.compute(predictions=pred, references=labels, average="weighted")
  return {**accuracy, **f1}

In [None]:
# Number of distinct app categories in the raw frame.
df["app_category"].nunique()

18

In [None]:
# Count the distinct encoded labels; this value is passed as num_labels when the model is loaded.
unique_labels = set(tokenized_dataset["labels"])
num_labels=len(unique_labels)
num_labels

18

In [None]:
from transformers import AutoModelForSequenceClassification
# Hand the name <-> id mappings to the model so its config (and any saved checkpoint)
# carries the real category names rather than generic placeholder labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use a dedicated output directory for this experiment: the earlier run in this notebook
# already writes to "test-trainer", and sharing the directory would mix or overwrite
# the checkpoints of the two runs.
training_args = TrainingArguments(
    output_dir="test-trainer-app-reviews",
    eval_strategy="epoch",
    num_train_epochs=3.0,
    weight_decay=5e-6,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
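# W&B API key redacted: never store credentials in the notebook. Supply the key at runtime (e.g. via the WANDB_API_KEY environment variable or wandb.login()) and rotate the key that was previously exposed here.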

In [None]:
# Run fine-tuning with the configured Trainer; evaluation is performed once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.000155,1.0,1.0
2,0.000400,7.9e-05,1.0,1.0
3,0.000400,6.6e-05,1.0,1.0


TrainOutput(global_step=756, training_loss=0.00032737604022104903, metrics={'train_runtime': 650.9143, 'train_samples_per_second': 9.269, 'train_steps_per_second': 1.161, 'total_flos': 1587577031645184.0, 'train_loss': 0.00032737604022104903, 'epoch': 3.0})