In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("/kaggle/input/kaspi-reviews-kz/kazakh_reviews.csv")

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(train_df.columns)

print("Train samples:", len(train_df))
print(train_df.head())

texts = train_df['combined_text'].astype(str).tolist()

print("Example text:", texts[0])

Index(['Unnamed: 0.1', 'Unnamed: 0', 'language', 'rating', 'category',
       'combined_text', 'sentiment'],
      dtype='object')
Train samples: 4729
      Unnamed: 0.1  Unnamed: 0 language  rating     category  \
339           5815        5815   kazakh     5.0  smartphones   
3800         65443       65443   kazakh     5.0       beauty   
1744         27827       27827   kazakh     5.0  smartphones   
925          12313       12313   kazakh     5.0  smartphones   
6703        116794      116794   kazakh     5.0   headphones   

                                          combined_text sentiment  
339                                    күшті телефон \n  positive  
3800                                         ұнады . \n  positive  
1744  суретка тұнық түсіреді ,  динамикас жақсы . да...  positive  
925   керемет екен . мена өте ризамын ,  шын айтамын...  positive  
6703  бәрі өте жақсы жұмыс істеп тұр . airpods керек...  positive  
Example text: күшті телефон 



In [2]:
import pandas as pd
import re

df = train_df[['combined_text', 'rating']].rename(columns={'combined_text': 'text'})

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zа-яәіқңғүұһёө\s.,!?-]", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['text'] = df['text'].apply(clean_text)

print(df.head())

print("Total processed texts:", len(df))

                                                   text  rating
339                                       күшті телефон     5.0
3800                                            ұнады .     5.0
1744  суретка тұнық түсіреді , динамикас жақсы . дау...     5.0
925   керемет екен . мена өте ризамын , шын айтамын ...     5.0
6703  бәрі өте жақсы жұмыс істеп тұр . airpods керек...     5.0
Total processed texts: 4729


In [3]:
def map_sentiment(score):
    if score < 2.0:
        return "negative"
    elif score < 4.0:
        return "neutral"
    else:
        return "positive"

df['label'] = df['rating'].apply(map_sentiment)

print(df.head())

print(df['label'].value_counts())

                                                   text  rating     label
339                                       күшті телефон     5.0  positive
3800                                            ұнады .     5.0  positive
1744  суретка тұнық түсіреді , динамикас жақсы . дау...     5.0  positive
925   керемет екен . мена өте ризамын , шын айтамын ...     5.0  positive
6703  бәрі өте жақсы жұмыс істеп тұр . airpods керек...     5.0  positive
label
positive    4446
neutral      172
negative     111
Name: count, dtype: int64


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=128
)

texts = df['text'].tolist()

results = sentiment_pipeline(texts, batch_size=32)

df['model_sentiment'] = [r['label'].lower() for r in results]

print(df.head())
print(df['model_sentiment'].value_counts())

2025-12-03 10:17:36.963314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764757057.341914      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764757057.451519      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cuda:0


                                                   text  rating     label  \
339                                       күшті телефон     5.0  positive   
3800                                            ұнады .     5.0  positive   
1744  суретка тұнық түсіреді , динамикас жақсы . дау...     5.0  positive   
925   керемет екен . мена өте ризамын , шын айтамын ...     5.0  positive   
6703  бәрі өте жақсы жұмыс істеп тұр . airpods керек...     5.0  positive   

     model_sentiment  
339          neutral  
3800        positive  
1744        positive  
925         positive  
6703        positive  
model_sentiment
positive    3707
neutral      745
negative     277
Name: count, dtype: int64


In [5]:
import pandas as pd

assert 'model_sentiment' in df.columns, "Run Block 5 first."

model_counts = df['model_sentiment'].value_counts()
model_percentages = round((model_counts / len(df)) * 100, 2)

model_summary = pd.DataFrame({
    "count": model_counts,
    "percentage": model_percentages
}).sort_index()

print("=== Model Sentiment Distribution ===")
print(model_summary)
print("\nTotal samples:", len(df))


=== Model Sentiment Distribution ===
                 count  percentage
model_sentiment                   
negative           277        5.86
neutral            745       15.75
positive          3707       78.39

Total samples: 4729


In [6]:
print(train_df.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'language', 'rating', 'category',
       'combined_text', 'sentiment'],
      dtype='object')


In [7]:

df2 = train_df[['combined_text', 'rating']].rename(columns={'sentence1_kk': 'text'})
def map_sentiment(score):
    if score < 2.0:
        return "negative"
    elif score < 4.0:
        return "neutral"
    else:
        return "positive"

df2['label'] = df2['rating'].apply(map_sentiment)

print(df2.head())

                                          combined_text  rating     label
339                                    күшті телефон \n     5.0  positive
3800                                         ұнады . \n     5.0  positive
1744  суретка тұнық түсіреді ,  динамикас жақсы . да...     5.0  positive
925   керемет екен . мена өте ризамын ,  шын айтамын...     5.0  positive
6703  бәрі өте жақсы жұмыс істеп тұр . airpods керек...     5.0  positive


In [8]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

assert 'label' in df.columns, "Run Block 3 first."
assert 'model_sentiment' in df.columns, "Run Block 5 first."

labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(df['label'], df['model_sentiment'], labels=labels)

cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in labels],
                        columns=[f"pred_{l}" for l in labels])

print("=== Confusion Matrix ===")
print(cm_df)

print("\n=== Agreement Report ===")
print(classification_report(df['label'], df['model_sentiment'], labels=labels))

accuracy = (df['label'] == df['model_sentiment']).mean() * 100
print(f"\nOverall agreement: {accuracy:.2f}%")


=== Confusion Matrix ===
               pred_negative  pred_neutral  pred_positive
true_negative             64            37             10
true_neutral              47            66             59
true_positive            166           642           3638

=== Agreement Report ===
              precision    recall  f1-score   support

    negative       0.23      0.58      0.33       111
     neutral       0.09      0.38      0.14       172
    positive       0.98      0.82      0.89      4446

    accuracy                           0.80      4729
   macro avg       0.43      0.59      0.46      4729
weighted avg       0.93      0.80      0.85      4729


Overall agreement: 79.68%
