In [7]:
# !pip install pytesseract transformers datasets evaluate rouge-score nltk tensorboard py7zr --upgrade
!pip install pytesseract transformers==4.28.1 datasets evaluate rouge-score nltk tensorboard py7zr 

Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting py7zr
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m


In [1]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, DatasetDict
from datasets import Dataset
from datasets import concatenate_datasets



In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm.auto import tqdm
import re

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Read preprocessed dataset

In [4]:
test_dataset = pd.read_csv("./unlabelled_dataset.csv")
test_dataset = test_dataset.dropna()

The dataset, which consists of comments labeled as 1 and 0, was acquired from Kaggle.
This dataset will be used as training data for the pre-trained model.

In [5]:
train_dataset = pd.read_csv("./hepsiburada.csv")
train_dataset = train_dataset.dropna()

The dataset obtained from Kaggle has been adjusted to align with the format of the dataset we intend to label.

In [6]:
train_dataset["Rating"] = train_dataset["Rating"].map({0: "negative", 1: "positive"})

In [7]:
train_dataset.head()

Unnamed: 0,Rating,Review
0,positive,3 yıldır tık demedi. :)
1,positive,3 yıldır kullanıyorum müthiş
2,positive,Ürün bugün elime geçti çok fazla inceleme fırs...
3,positive,Almaya karar verdim. Hemencecik geldi. Keyifle...
4,positive,Günlük kullanımınızı çok çok iyi karsılıyor kı...


In [8]:
train_dataset = train_dataset.rename(columns={'Review': 'review', 'Rating': 'sentiment'})

In [9]:
train_dataset.head()

Unnamed: 0,sentiment,review
0,positive,3 yıldır tık demedi. :)
1,positive,3 yıldır kullanıyorum müthiş
2,positive,Ürün bugün elime geçti çok fazla inceleme fırs...
3,positive,Almaya karar verdim. Hemencecik geldi. Keyifle...
4,positive,Günlük kullanımınızı çok çok iyi karsılıyor kı...


Preprocessing hepsiburada dataset

In [10]:
def preprocessing(df):
    preprocessed_reviews = []
    sw = set(stopwords.words("turkish"))
    for review in tqdm(df['review'].values):
        review = re.sub(r'[^\w\s]', '', review)
        review = re.sub(r'http\S+', '', review)
#         review = re.sub(r'quoteOrjinalden alıntı\s+\w+', '', review)
        review = re.sub(r'\S*\d\S*', '', review)
        review = re.sub(r'\S*\d\S*', '', review)
        review = ' '.join(word.lower() for word in review.split() if word.lower() not in sw)
        preprocessed_reviews.append(review.strip())
    df['review']=preprocessed_reviews
    return df
train_dataset = preprocessing(train_dataset)

  0%|          | 0/243497 [00:00<?, ?it/s]

In [11]:
train_dataset.head()

Unnamed: 0,sentiment,review
0,positive,yıldır tık demedi
1,positive,yıldır kullanıyorum müthiş
2,positive,ürün bugün elime geçti fazla inceleme fırsatım...
3,positive,almaya karar verdim hemencecik geldi keyifle k...
4,positive,günlük kullanımınızı iyi karsılıyor kısaca mük...


Load the T5 model and tokenizer.T5 (Text-To-Text Transfer Transformer) is a natural language processing (NLP) model developed by Google AI

In [12]:
model_id = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_id)
# model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Comments that are excessively lengthy are not suitable for T5Tokenizer.

In [13]:
mask = test_dataset['review'].str.len() < 300 
test_dataset = test_dataset.loc[mask]

In [14]:
mask = train_dataset['review'].str.len() < 300 
train_dataset = train_dataset.loc[mask]

Convert datasets to concatenate.

In [15]:
train_df = Dataset.from_pandas(train_dataset)
test_df = Dataset.from_pandas(test_dataset)

In [17]:
type(train_df)

datasets.arrow_dataset.Dataset

The model accepts tokens that have the same length. To achieve this, we will  identify the maximum token length and pad shorter tokens to match it

In [18]:
tokenized_inputs = concatenate_datasets([train_df, test_df]).map(lambda x: tokenizer(x["review"], truncation=True,max_length=1024,return_overflowing_tokens=True), batched=True, remove_columns=['review', 'sentiment'])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

tokenized_targets = concatenate_datasets([train_df, test_df]).map(lambda x: tokenizer(x["sentiment"], truncation=True,max_length=512), batched=True, remove_columns=['review', 'sentiment'])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

  0%|          | 0/285 [00:00<?, ?ba/s]

Max source length: 213


  0%|          | 0/285 [00:00<?, ?ba/s]

Max target length: 2


In [19]:
datasets = DatasetDict({"train": train_df, "test": test_df})

In [20]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'review', '__index_level_0__'],
        num_rows: 220493
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'review', 'sentiment', '__index_level_0__'],
        num_rows: 64445
    })
})

All textual and sentiment data are tokenized to the maximum token length. Tokens shorter than this maximum length are padded. To enhance the model's performance, the sentiment data has been replaced with -100 to exclude the padded sections. In the review data, the non-padded sections that require consideration will be explicitly specified using an 'attention_mask'.

In [21]:
def preprocess_function(sample, padding="max_length"):
    inputs = [item for item in sample["review"]]

    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["sentiment"], max_length=max_target_length, padding=padding, truncation=True)
    
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = datasets.map(preprocess_function, batched=True, remove_columns=['review', 'sentiment'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

  0%|          | 0/221 [00:00<?, ?ba/s]

  0%|          | 0/65 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels']


The pre-trained model 'google/flan-t5-base' has been loaded. This model is designed for text summarization, not sentiment analysis. If you input tokens directly into the model, it will only generate summaries, not perform sentiment labelling.

In [22]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

To generate sentiment labels using the model originally designed for text summarization, fine-tuning of the pre-trained model is required.
Fine-tuning is performed using the training parameters specified in Seq2SeqTrainingArguments. The pre-trained model, after being re-trained with these parameters and a labeled training dataset, gains the capability to perform sentiment labeling instead of text summarization.

In [23]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    learning_rate=3e-4,

    num_train_epochs=2,
    logging_strategy="epoch", 
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"]
)


In [24]:
trainer.train()




Step,Training Loss
13781,0.0635
27562,0.0433




TrainOutput(global_step=27562, training_loss=0.05343617521689736, metrics={'train_runtime': 19425.2732, 'train_samples_per_second': 22.702, 'train_steps_per_second': 1.419, 'total_flos': 1.2562357508022067e+17, 'train_loss': 0.05343617521689736, 'epoch': 2.0})

The model performs labeling on the test dataset.

In [25]:
samples_number = len(datasets["test"])
progress_bar = tqdm(range(samples_number))
predictions_list = []
for i in range(samples_number):
    text = datasets['test']['review'][i]
    if isinstance(text, str):
        inputs = tokenizer.encode_plus(text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions_list.append(prediction)
#         datasets["test"]['sentiment'][i] = prediction 
    else :
        predictions_list.append("neutral")
    progress_bar.update(1)

  0%|          | 0/64445 [00:00<?, ?it/s]

In [39]:
predictions_list[2155]

'positive'

The dataset is updated with sentiments.

In [40]:
df = pd.DataFrame({'review': datasets['test']['review'], 'sentiment': predictions_list})

In [43]:
df.head(20)

Unnamed: 0,review,sentiment
0,bayiler satar artık e burası türkiye,positive
1,fiyatlara bin ekleyin bayide bulamayacaksınız ...,positive
2,motor kasaya yakışmış,positive
3,tl eksik aracı almam neyse sene bekleyem alırım,positive
4,eskiden milyoner olmak vardı derlerdi nerden g...,positive
5,arkadaşlarla eytli arabası deyip takıldığımız ...,positive
6,eski türkiyede bayiye gittin mi araç satmak ba...,positive
7,fiyat listesi yerine stok listesi paylaşılsa f...,positive
8,yok sen onu gecicentoyotanin sitesinde gordugu...,positive
9,alinmaz arabalar fiyatlara kadar zam gelecek s...,positive


Save the dataset

In [44]:
df.to_csv('labelled_dataset_with_FLAN.csv')