In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
import pandas as pd
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel, load_metric

# Veri setini yükleme ve temizleme
data_path = 'IMDB Dataset.csv'
imdb_data = pd.read_csv(data_path)
imdb_data['review'] = imdb_data['review'].apply(lambda x: re.sub(r'<br\s*/?>', ' ', x.lower()))

# Duygu analizi ile sınıflandırma
def get_sentiment(review):
    analysis = TextBlob(review)
    polarity = analysis.sentiment.polarity
    if polarity > 0.1:
        return 0  # positive
    elif polarity < -0.1:
        return 1  # negative
    else:
        return 2  # mixed

imdb_data['label'] = imdb_data['review'].apply(get_sentiment)


# Eğitim ve test veri setlerini ayırma
train_df, test_df = train_test_split(imdb_data, test_size=0.2, random_state=42)

# Hugging Face dataset nesnelerine dönüştürme
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
 2   label      50000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [14]:
imdb_data.to_csv("imdb_data_mixed", index= False)

In [6]:
imdb_data.label.value_counts()

label
0    25495
2    19697
1     4808
Name: count, dtype: int64

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-imdb")
model = AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-imdb", num_labels=3, ignore_mismatched_sizes=True)  # Üç sınıf

# Modelin çıkış katmanını yeni sınıflara uygun olarak yeniden başlatma
model.classifier = torch.nn.Linear(model.config.hidden_size, 3)
torch.nn.init.xavier_normal_(model.classifier.weight)

# Tokenizasyon fonksiyonu
def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Eğitim metrikleri
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Metrik hesaplama
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return load_metric("accuracy").compute(predictions=predictions, references=labels)

# Trainer oluşturma
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Modeli eğitme
trainer.train()

# Modeli değerlendirme
results = trainer.evaluate()
print(results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at textattack/bert-base-uncased-imdb and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 40000/40000 [00:12<00:00, 3147.09 examples/s]
Map: 100%|██████████| 10000/10000 [00:03<00:00, 2948.18 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5609,0.582311,0.7562
2,0.4295,0.59429,0.7689
3,0.3373,0.805171,0.7661


  return load_metric("accuracy").compute(predictions=predictions, references=labels)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.8051713705062866, 'eval_accuracy': 0.7661, 'eval_runtime': 193.1194, 'eval_samples_per_second': 51.781, 'eval_steps_per_second': 3.236, 'epoch': 3.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Model Performance

In [16]:
import numpy as np
from sklearn.metrics import classification_report
# Model değerlendirme ve tahminlerin elde edilmesi
metrics = trainer.evaluate()
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Gerçek etiketler
true_labels = predictions.label_ids

# Sınıflandırma raporunun hazırlanması
report = classification_report(true_labels, predicted_labels, target_names=['positive', 'negative', 'mixed'])
print(report)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


              precision    recall  f1-score   support

    positive       0.83      0.83      0.83      5103
    negative       0.66      0.64      0.65       956
       mixed       0.70      0.71      0.71      3941

    accuracy                           0.77     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.77      0.77      0.77     10000



# Save Model

In [15]:
# Modeli ve tokenizer'ı kaydet
model_save_path = "./saved_model"
tokenizer_save_path = "./saved_model"

# Modeli kaydet
model.save_pretrained(model_save_path)

# Tokenizer'ı kaydet
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model ve tokenizer {model_save_path} dizinine kaydedildi.")

Model ve tokenizer ./saved_model dizinine kaydedildi.


# Load Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_load_path = "./saved_model"
tokenizer_load_path = "./saved_model"

model = AutoModelForSequenceClassification.from_pretrained(model_load_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_load_path)