In [2]:
import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from rank_bm25 import BM25Okapi
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the dataset (the file is read as tab-separated despite its .csv name)
data = pd.read_csv("output.csv", sep='\t')

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=0.1, random_state=64)

# Text preprocessing resources.
# The NLTK stopword corpus is not bundled with the package: fetch it once when
# it is missing instead of failing with LookupError on a fresh environment.
try:
    stop_words = stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('russian')
# A set makes the per-token membership test during preprocessing O(1) instead
# of scanning the whole stopword list for every token.
stop_words = set(stop_words)
stemmer = SnowballStemmer('russian')

In [None]:
def preprocess_text(text):
    """Turn a raw text into a list of stemmed, stopword-free tokens.

    Args:
        text: Raw text. Non-string values are converted with ``str``;
            missing values (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        list: Lower-cased, whitespace-split tokens with the entries found in
        the module-level ``stop_words`` removed, each passed through the
        module-level ``stemmer``. Empty list for missing or blank input.
    """
    # A missing cell would otherwise be stringified into the bogus token
    # "nan" / "none" and end up in the corpus as if it were a real word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    return [
        stemmer.stem(token)
        for token in str(text).lower().split()
        if token not in stop_words
    ]

# Tokenise, filter and stem the raw text of both splits
for split_frame in (train_data, test_data):
    split_frame['Processed_Text'] = split_frame['Text'].apply(preprocess_text)

# BM25 vectorisation: the index is built from the training documents only
train_tokens = train_data['Processed_Text'].tolist()
bm25 = BM25Okapi(train_tokens)

# Each text is represented by the scores bm25.get_scores returns for its tokens
train_vectors = [bm25.get_scores(tokens) for tokens in train_tokens]

test_vectors = [bm25.get_scores(tokens) for tokens in test_data['Processed_Text'].tolist()]


In [None]:
# Stack the per-text score vectors into a single numpy array for scikit-learn
train_vectors = np.asarray(train_vectors)

# Fit a logistic-regression classifier (library defaults) on the training split
model = LogisticRegression()
model.fit(X=train_vectors, y=train_data['Score'])

# Predict labels for the held-out split
predictions = model.predict(X=test_vectors)

# Share of held-out rows whose predicted label equals the true one
accuracy = accuracy_score(y_true=test_data['Score'], y_pred=predictions)
print(f"Accuracy: {accuracy}")

In [1]:
# Persist the fitted classifier to disk
joblib.dump(value=model, filename='sentiment_model.pkl')

# Restore the model later (if needed)
# model = joblib.load('sentiment_model.pkl')

NameError: name 'pd' is not defined

In [7]:
# Interpretation of the fitted classifier.
# Every feature column is the BM25 score of a text against ONE training
# document (bm25.get_scores returns one score per indexed document), so
# coefficient i belongs to training document i rather than to a vocabulary
# word. Zipping the coefficients with bm25.idf paired them with unrelated
# dictionary keys (tokens) and printed those tokens under an "IDF" label.
TOP_K = 10
coefs = model.coef_[0]
feature_docs = train_data['Text'].astype(str).tolist()
sorted_coefs = sorted(zip(coefs, feature_docs), key=lambda x: abs(x[0]), reverse=True)

# Rank each sign separately: filtering the overall top-K by sign (the previous
# approach) yields fewer than K entries per class and can leave one class empty.
# sorted_coefs is ordered by |coef|, so each filtered list is already ranked.
positive_features = [pair for pair in sorted_coefs if pair[0] > 0][:TOP_K]
negative_features = [pair for pair in sorted_coefs if pair[0] < 0][:TOP_K]

# Positive coefficients push the decision towards model.classes_[1]
print(f"Top {TOP_K} positive training documents:")
for coef, doc in positive_features:
    # Only the first 80 characters are shown to keep the listing readable
    print(f"Document: {doc[:80]!r}, Coefficient: {coef}")

print(f"\nTop {TOP_K} negative training documents:")
for coef, doc in negative_features:
    print(f"Document: {doc[:80]!r}, Coefficient: {coef}")

# Coefficient values only, consumed by the bar chart that visualises them
top_positive_coefs = [coef for coef, _ in positive_features]
top_negative_coefs = [coef for coef, _ in negative_features]

Map:   0%|          | 0/200142 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 200142/200142 [00:32<00:00, 6187.62 examples/s]
Map: 100%|██████████| 22238/22238 [00:03<00:00, 5992.50 examples/s]
Map: 100%|██████████| 55596/55596 [00:09<00:00, 5949.17 examples/s]


In [8]:
# Bar chart of the selected positive (green) and negative (red) coefficients
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(range(len(top_positive_coefs)), top_positive_coefs, color='green', label="Positive")
ax.bar(range(len(top_negative_coefs)), top_negative_coefs, color='red', label="Negative")
plt.xticks(rotation=45, ha='right')
ax.set_title("Top 10 Positive and Negative Words")
ax.set_xlabel("Words")
ax.set_ylabel("Coefficient")
ax.legend()
plt.show()

# Error analysis: keep the held-out rows whose predicted label differs from the true one
is_misclassified = test_data['Score'].ne(predictions)
incorrect_predictions = test_data[is_misclassified]
print("\nExamples of incorrect predictions:")
print(incorrect_predictions.loc[:, ['Text', 'Score']].head(10))

from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels on the held-out split
cm = confusion_matrix(y_true=test_data['Score'], y_pred=predictions)
print(f"Confusion Matrix:\n{cm}")

# Heat-map rendering of the confusion matrix
class_names = ['Negative', 'Positive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

Map: 100%|██████████| 200142/200142 [00:34<00:00, 5820.82 examples/s]
Map: 100%|██████████| 22238/22238 [00:03<00:00, 5661.00 examples/s]
Map: 100%|██████████| 55596/55596 [00:09<00:00, 5628.58 examples/s]


In [10]:
pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.4.0-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Downloading torchvision-0.19.0-1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.4.0-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.2-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.4.0-cp312-cp312-win_amd64.whl (197.8 MB)
   ---------------------------------------- 0.0/197.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/197.8 MB 7.5 MB/s eta 0:00:27
   ---------------------------------------- 0.5/197.8 MB 5.9 MB/s eta 0:00:34
   ---------------------------------------- 0.8/197.8 MB 7.7 MB/s eta 0:00:26
   ---------------------------------------- 1.4/197

In [11]:
# Build the classifier.
# Use the PyTorch model class: transformers.Trainer trains PyTorch models, and
# the inference code in this notebook requests PyTorch tensors
# (return_tensors="pt") and calls .detach() on the logits. The TensorFlow
# TFAutoModelForSequenceClassification class is incompatible with both.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Evaluation metrics
def compute_metrics(pred):
    """Score a set of predictions against their true labels.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as
            the predicted label).

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``, each computed
        by the matching ``*_score`` function on (true labels, predicted labels).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}

# Trainer and TrainingArguments are not imported by any other visible cell;
# import them in the cell that uses them so it does not fail with NameError.
from transformers import Trainer, TrainingArguments

# Training hyper-parameters
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (once per epoch), which
    # load_best_model_at_end needs in order to pick the best checkpoint.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, datasets and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning
trainer.train()


OSError: [WinError 126] Не найден указанный модуль. Error loading "C:\Users\ADMIN\miniconda3\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [None]:
# Evaluate the trained model on the test split
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Encode the string labels once instead of re-mapping the column for every metric
LABEL_TO_ID = {"Positive": 1, "Negative": 0}
true_labels = test_data["Score"].map(LABEL_TO_ID)
if true_labels.isna().any():
    # .map() silently yields NaN for labels missing from the mapping; fail
    # with a clear message instead of an obscure error inside the metrics.
    unexpected = sorted(set(test_data["Score"][true_labels.isna()].astype(str)))
    raise ValueError(f"Unexpected values in test_data['Score']: {unexpected}")

# Compute the metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
cm = confusion_matrix(true_labels, predicted_labels)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Confusion Matrix:\n {cm}")


In [None]:

# Sentiment prediction for a single text
def predict_sentiment(text):
    """Classify one text with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to classify.

    Returns:
        tuple: ``(sentiment, probability)`` where ``sentiment`` is
        ``"Positive"`` when the arg-max over all logits is index 1 and
        ``"Negative"`` otherwise, and ``probability`` is a numpy array of the
        same shape as the logits holding their softmax (normalised over all
        entries).
    """
    inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
    # The model may live on an accelerator; feeding it CPU tensors would raise
    # a device-mismatch error, so move the inputs to the model's device.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    outputs = model(**inputs)
    # .cpu() is required before .numpy() when the logits are on an accelerator
    logits = outputs.logits.detach().cpu().numpy()
    predicted_class = int(np.argmax(logits))
    # Subtracting the maximum keeps exp() from overflowing to inf (and the
    # ratio from becoming NaN) for large logits; the softmax value is unchanged.
    exp_logits = np.exp(logits - np.max(logits))
    probability = exp_logits / np.sum(exp_logits)
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    return sentiment, probability

# Usage example: classify one sample review and print the outcome
text = "Отличный банк, всем рекомендую!"
sentiment, probability = predict_sentiment(text=text)
print(f"Sentiment: {sentiment}, Probability: {probability}")