In [2]:
import  pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [3]:
# Directly Downloading dataset from kaggle and loading 
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("tarkkaanko/amazon")

print("Path to dataset files:", os.listdir(path))

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: ['amazon_reviews.csv~', 'amazon_reviews.csv']


# 1. Data Preprocessing

In [4]:
df = pd.read_csv(f'{path}/amazon_reviews.csv')
print(df.shape)
df.head(5)

(4915, 12)


Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [12]:
# Removing the other nucessary columns
df = df[['reviewText','overall']]

# Convert ratings to binary labels (-1=negative, 0=neutral, 1=positive)
df['label'] = df['overall'].apply(lambda x: -1 if x <= 2 else 0 if x == 3 else 1 if x>3 else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['overall'].apply(lambda x: -1 if x <= 2 else 0 if x == 3 else 1 if x>3 else None)


In [13]:
df.head()

Unnamed: 0,reviewText,overall,label
0,No issues.,4.0,1
1,"Purchased this for my device, it worked as adv...",5.0,1
2,it works as expected. I should have sprung for...,4.0,1
3,This think has worked out great.Had a diff. br...,5.0,1
4,"Bought it with Retail Packaging, arrived legit...",5.0,1


In [14]:
df['label'].value_counts()

label
 1    4449
-1     324
 0     142
Name: count, dtype: int64

In [15]:
def clean_text(text):
    # Checks if the text are actully string or not
    if not isinstance(text, str):
        text = str(text)
    # Convet to lowerCase
    text = text.lower()
    # Remove Urls
    #text = re.sub('https?://\S+|www\.\S+','',text)
    # Remove Html tags
    #text = re.sub(r'<.*?>','',text)
    # Remove special characters ann numbers
    text = re.sub(r'[^a-zA-Z\s]','',text)

    # Split the sentences into words
    text= text.split()

    # Applying stopwords and lemmatization
    text = [lemmatizer.lemmatize(word) for word in text if not word in stopwords.words('english')]

    # Remove extra whitespace
    text = ' '.join(text)
    return text

df['cleaned_reviewText'] = df['reviewText'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_reviewText'] = df['reviewText'].apply(clean_text)


In [18]:
df.head()

Unnamed: 0,reviewText,overall,label,cleaned_reviewText
0,No issues.,4.0,1,issue
1,"Purchased this for my device, it worked as adv...",5.0,1,purchased device worked advertised never much ...
2,it works as expected. I should have sprung for...,4.0,1,work expected sprung higher capacity think mad...
3,This think has worked out great.Had a diff. br...,5.0,1,think worked greathad diff bran gb card went s...
4,"Bought it with Retail Packaging, arrived legit...",5.0,1,bought retail packaging arrived legit orange e...


In [19]:
df.isnull().sum()

reviewText            1
overall               0
label                 0
cleaned_reviewText    0
dtype: int64

In [20]:
df.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


# 2. Tokenization (No Manual Vectorization Needed for Transformers)

#### Transformers handle vectorization internally through their tokenizers:

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['cleaned_reviewText'],padding='max_length',truncation= True)

# Filling the nan valuer with empth string
df['cleaned_reviewText'] = df['cleaned_reviewText'].fillna('').astype(str)

# Converting to HuggingFace Dataset format
from datasets import Dataset
dataset = Dataset.from_pandas(df)
tokenize_dataset = dataset.map(tokenize_function,batched=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_reviewText'] = df['cleaned_reviewText'].fillna('').astype(str)
Map: 100%|██████████| 4914/4914 [00:00<00:00, 6385.42 examples/s]


In [22]:
print(tokenize_dataset)

Dataset({
    features: ['reviewText', 'overall', 'label', 'cleaned_reviewText', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4914
})


In [23]:
print(tokenize_dataset.features)

{'reviewText': Value(dtype='string', id=None), 'overall': Value(dtype='float64', id=None), 'label': Value(dtype='int64', id=None), 'cleaned_reviewText': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


# 3. Model Traning using Transformers Models

In [30]:
from transformers import AutoModelForSequenceClassification, TrainingArguments , Trainer
import numpy as np
import evaluate

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels= 3 # hyperparamerter
)

#split dataset
train_test = tokenize_dataset.train_test_split(test_size=0.2)

# Metrics 
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits,labels = eval_pred
    predictions = np.argmax(logits,axis = -1)
    return metric.compute(predictions=predictions,references = labels)

# Training arguments
training_args = TrainingArguments(
    output_dir = './results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to='none' #disable wandb/etc for simplicty

)
# Treiner
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    compute_metrics= compute_metrics
)

# Train
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`