In [21]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import string
import pandas as pd
import re
from sklearn.metrics import classification_report

In [22]:
!curl -L -o tone-detection.zip https://www.kaggle.com/api/v1/datasets/download/zeeshanshaik75/tone-detection
!unzip tone-detection.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2376k  100 2376k    0     0  1388k      0  0:00:01  0:00:01 --:--:-- 3078k
Archive:  tone-detection.zip
  inflating: total_df.csv            


In [23]:
# Load the dataset and pull out the text column with its matching labels
df = pd.read_csv("total_df.csv")
texts, labels = df["text"], df["label"]

# Fetch the NLTK resources used during text preprocessing
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Module-level objects read by preprocess_text.
# NOTE(review): the NLTK English list presumably contains apostrophe forms
# (e.g. "don't"); preprocess_text strips punctuation before the stop-word
# lookup, so those entries would never match - confirm whether that matters.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every character of string.punctuation, and
# a precompiled pattern for digit runs. Both are built once, not per call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')


def preprocess_text(text):
    """Normalize a raw text string before vectorization.

    Steps, in order: lowercase, delete ASCII punctuation, delete runs of
    digits, split on whitespace, drop tokens present in the module-level
    ``stop_words`` set, and lemmatize the remaining tokens with the
    module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces (may be empty).
    """
    # Lowercase
    text = text.lower()
    # Remove punctuation. str.translate deletes every character listed in
    # string.punctuation. The earlier f'[{string.punctuation}]' regex did
    # not: inside the character class the sequence "\]" is parsed as an
    # escaped "]", so backslashes were silently left in the text.
    text = text.translate(_PUNCT_TABLE)
    # Remove digits
    text = _DIGITS_RE.sub('', text)
    # Tokenize on whitespace
    words = text.split()
    # Drop stop words (checked on the raw token), then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply the preprocessing to every text
processed_texts = texts.apply(preprocess_text)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts, labels, test_size=0.2, random_state=42
)

# Build the pipeline: vectorization + model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline. Only the vectorizer and classifier are stored;
# preprocess_text is applied outside the pipeline, so whoever loads model.pkl
# must run the same preprocessing on the input before calling predict.
with open("model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

# Evaluate the model.
# zero_division=0 reports precision/F1 as 0.0 for classes that were never
# predicted (the same value the default prints) without emitting an
# UndefinedMetricWarning for each such metric.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Example prediction: the sample goes through the same preprocessing as the
# training texts before being passed to the pipeline.
sample_text = "This product is absolutely amazing, I love it!"
processed_sample = preprocess_text(sample_text)
prediction = pipeline.predict([processed_sample])
print(f"Prediction for sample text: {prediction[0]}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aleksei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aleksei/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                precision    recall  f1-score   support

      admiring       0.61      0.55      0.58      1200
     amusement       0.80      0.73      0.76       564
         anger       0.56      0.30      0.39       384
     annoyance       0.40      0.11      0.18       524
      approval       0.44      0.14      0.21       720
        caring       0.49      0.13      0.21       256
     concerned       0.52      0.11      0.18       458
     confident       0.43      0.14      0.21       232
     confusion       0.53      0.14      0.22       309
        direct       0.00      0.00      0.00        57
disappointment       0.31      0.04      0.07       268
  disapproving       0.28      0.07      0.11       408
       disgust       0.64      0.27      0.38       147
        formal       0.50      0.04      0.07        80
      friendly       0.00      0.00      0.00        65
     gratitude       0.81      0.81      0.81       534
      informal       1.00      0.01      0.02  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Spot-check the fitted pipeline on a short lowercase message; the text is
# run through preprocess_text first, exactly as the training texts were.
sample_text = "i miss you"
processed_sample = preprocess_text(sample_text)
prediction = pipeline.predict([processed_sample])
print(f"Prediction for sample text: {prediction[0]}")

Prediction for sample text: sadness
