# Review Sentiment Analysis

## 1. Load & preprocess data.

### Importing Libraries

In [None]:
import pandas as pd
from datasets import Dataset
import re
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback,AutoModelForSequenceClassification,AutoTokenizer

import evaluate
import numpy as np


### Loading Data

In [3]:
# Load dataset from a CSV file
data = pd.read_csv('../data/amazon.csv')


In [7]:
data.columns

Index(['Text', 'label'], dtype='object')

In [8]:
data.head()

Unnamed: 0,Text,label
0,This is the best apps acording to a bunch of ...,1
1,This is a pretty good version of the game for ...,1
2,this is a really . there are a bunch of levels...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


### Preprocessing

In [12]:
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [13]:
data['Text'] = data['Text'].apply(clean_text)

### Training data into train and test dataframe

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data["Text"], data["label"], test_size=0.3, random_state=42,shuffle=True,)

train_df = pd.DataFrame({"Text": X_train, "label": y_train})
test_df = pd.DataFrame({"Text": X_test, "label": y_test})

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


## 2. Tokenize using Hugging Face tokenizer.

In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

def preprocess_function(examples):
    return tokenizer(examples["Text"], truncation=True, padding=True, max_length=512)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/13997 [00:00<?, ? examples/s]

Map:   0%|          | 0/5999 [00:00<?, ? examples/s]

In [16]:
tokenized_train

Dataset({
    features: ['Text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 13997
})

In [17]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilroberta-base", num_labels=2)  

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Take the highest probability as the predicted class
    return accuracy_metric.compute(predictions=predictions, references=labels)


## 3. Fine-tune `distilroberta-base`.

In [15]:
# Training Arguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",                  # Evaluate after every epoch
    learning_rate=1e-6,                     # Learning rate
    per_device_train_batch_size=32,         # Batch size for training
    per_device_eval_batch_size=32,          # Batch size for evaluation
    num_train_epochs=10,                    # Number of epochs
    weight_decay=0.01,                      # Regularization
    logging_dir="./logs",                   # Directory for logging
    logging_steps=100,                      # Log after every 100 steps
    save_total_limit=2,                     # Keep only 2 checkpoints
    load_best_model_at_end=True,            # Load the best model
    save_strategy="epoch",                  # Save model after every epoch
    report_to="none",                       # Disable W&B or other integrations
    gradient_accumulation_steps=2,          # Accumulate gradients to handle smaller batches
    warmup_ratio=0.1                        # Warmup learning rate for stability
)

# Early Stopping Callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement after 2 evals
    early_stopping_threshold=0.0  # Minimum improvement required
)

# Define Trainer
trainer = Trainer(
    model=model,                            # The pre-trained model
    args=training_args,                     # Training arguments
    train_dataset=tokenized_train,          # Training data
    eval_dataset=tokenized_test,            # Evaluation data
    tokenizer=tokenizer,                    # Tokenizer for data processing
    compute_metrics=compute_metrics,        # Evaluation metrics          
)

# Train the Model
trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1074,0.157041,0.946824
2,0.101,0.159168,0.945991
3,0.1039,0.161691,0.947825
4,0.0954,0.158832,0.947491
5,0.0938,0.160742,0.947325
6,0.0931,0.158928,0.947991
7,0.0961,0.160353,0.948325
8,0.0853,0.161543,0.947491
9,0.0973,0.160497,0.948658
10,0.0983,0.160208,0.948325


TrainOutput(global_step=2190, training_loss=0.0970242718039038, metrics={'train_runtime': 906.6841, 'train_samples_per_second': 154.376, 'train_steps_per_second': 2.415, 'total_flos': 4487733636372624.0, 'train_loss': 0.0970242718039038, 'epoch': 10.0})

## 4. Evaluate accuracy.

In [16]:
# Evaluate the Model
evaluation_results = trainer.evaluate()
print(f"Evaluation Results: {evaluation_results}")


Evaluation Results: {'eval_loss': 0.15704137086868286, 'eval_accuracy': 0.9468244707451242, 'eval_runtime': 8.6977, 'eval_samples_per_second': 689.721, 'eval_steps_per_second': 21.615, 'epoch': 10.0}


## 5. Save trained model.

In [17]:
# Save the model and tokenizer
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")



('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [None]:
# This is just for saving the model from kaggle

# import shutil

# shutil.make_archive("saved_model", 'zip', "./saved_model")

# from IPython.display import FileLink
# FileLink("saved_model.zip")
