In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from datasets import Dataset
import re
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load your data
df = pd.read_csv("data.csv")  
df['text'] = df['text'].apply(lambda x: re.sub(r"[^a-zA-ZığüşöçİĞÜŞÖÇ ]","",x).lower())
# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # random_state for reproducibility

In [3]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased") # Türkçe BERT modelini kullan

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=len(set(df["label"]))) # Türkçe BERT modelini kullan


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Map:   0%|          | 0/6243 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    num_train_epochs=5,  # Adjust as needed
    per_device_train_batch_size=8,  # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)


In [None]:

# Define a function to compute metrics
def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets, # Using same dataset for eval in this example
    compute_metrics=compute_metrics, # Pass the metrics function to the Trainer
)

trainer.train()


  0%|          | 0/3905 [00:00<?, ?it/s]

{'loss': 0.411, 'grad_norm': 0.07530378550291061, 'learning_rate': 5e-05, 'epoch': 0.64}


In [7]:

# Save the model
trainer.save_model("./fine_tuned_bert0")

test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Evaluate on the test dataset
results = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(results.predictions, axis=1)
accuracy = accuracy_score(test_df["label"], predicted_labels)

print(f"Test Accuracy: {accuracy}")

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

  0%|          | 0/80 [00:00<?, ?it/s]

Test Accuracy: 0.9265625


In [8]:
diff_df = test_df
diff_df['predicted'] = predicted_labels
diff_df = diff_df[diff_df['label'] != diff_df['predicted']]
diff_df

Unnamed: 0,text,label,predicted
2895,the only reason why im giving a five star rat...,1,0
1289,another day and another review for vodaphone d...,0,1
1590,we have att fiber optics home internet we have...,0,1
564,i recently wrote a review of assurant the comp...,0,1
594,att has the worst customer service ive ever ex...,0,1
1446,my wife and i shared a grandfathered data plan...,0,1
1151,this is the second part of a review without go...,0,1
51,trying to interact with customer service is in...,0,1
63,well i had to hook up a line just to see if i ...,0,1
2440,im trying to figure why so many morans are giv...,1,0


In [9]:
diff_df.to_csv("false_predictions.csv",index=False)

In [10]:
df = test_df
df['predicted'] = predicted_labels

In [11]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(df['label'], df['predicted'])

# Confusion matrix'i daha okunaklı hale getirmek için bir DataFrame'e dönüştürün
cm_df = pd.DataFrame(cm, index=df['label'].unique(), columns=df['label'].unique())

# Confusion matrix'i görüntüleyin
cm_df

Unnamed: 0,1,0
1,312,31
0,16,281


In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Sınıflandırma raporunu yazdır
print(classification_report(df['label'], df['predicted']))

# Her sınıf için metrikleri hesapla
accuracy = accuracy_score(df['label'], df['predicted'])
precision = precision_score(df['label'], df['predicted'], average=None)
recall = recall_score(df['label'], df['predicted'], average=None)
f1 = f1_score(df['label'], df['predicted'], average=None)

# Sonuçları yazdır
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       343
           1       0.90      0.95      0.92       297

    accuracy                           0.93       640
   macro avg       0.93      0.93      0.93       640
weighted avg       0.93      0.93      0.93       640

Accuracy: 0.9265625
Precision: [0.95121951 0.90064103]
Recall: [0.90962099 0.94612795]
F1-score: [0.92995529 0.9228243 ]


In [19]:
model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_bert0")
def preprocess_text(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

import torch

def predict(text):
    inputs = preprocess_text(text)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.item()

# Example usage
text =""
prediction = predict(text)
print(f"Prediction: {prediction}")

Prediction: 1
