####  Для выполнения задания будем  использовать датасет IMDB с комментариями к фильмам
скаченный датасет помещаем в директорию проекта

In [1]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
imdb_df = pd.read_csv('IMDB_Dataset.csv')
imdb_df.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [6]:
# опедееляем функцию и паборы слов для разметки
def rule_based_tagging(text):
    positive_keywords = ['good', 'great', 'excellent', 'fantastic', 'amazing']
    negative_keywords = ['bad', 'terrible', 'awful', 'worst', 'poor']
    
    text = text.lower()
    if any(word in text for word in positive_keywords):
        return 'positive'
    elif any(word in text for word in negative_keywords):
        return 'negative'
    else:
        return 'neutral'

# Применяем правила основанной разметки к датасету
imdb_df['rule_based_tag'] = imdb_df['review'].apply(rule_based_tagging)

# Посмотрим результат
imdb_df.head()

Unnamed: 0,review,sentiment,rule_based_tag
0,One of the other reviewers has mentioned that ...,positive,neutral
1,A wonderful little production. <br /><br />The...,positive,positive
2,I thought this was a wonderful way to spend ti...,positive,positive
3,Basically there's a family where a little boy ...,negative,neutral
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,positive


In [7]:
# Применяем токенизацию к подмножеству данных
imdb_df['tokens'] = imdb_df['review'].apply(word_tokenize)

# Просмотрим результат
imdb_df.head()


Unnamed: 0,review,sentiment,rule_based_tag,tokens
0,One of the other reviewers has mentioned that ...,positive,neutral,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,positive,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,neutral,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ..."


##### Совмещение ручной разметки данных с данными, размеченными по правилам


In [8]:
# Предполагая, что manual_tagged_data - другой DataFrame с отзывами размеченными руками
# Для примера сейчас будем использовать данные, размеченные по правилам
manual_tagged_data = imdb_df.copy()  # Для демонстрации будем использовать imdb_df
combined_data = pd.concat([manual_tagged_data[['review', 'rule_based_tag']], imdb_df[['review', 'rule_based_tag']]])

# Просмотрим результат
combined_data.head()


Unnamed: 0,review,rule_based_tag
0,One of the other reviewers has mentioned that ...,neutral
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,neutral
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


##### Обучение модели на объединенном размеченном датасете


In [9]:
# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(combined_data['review'], combined_data['rule_based_tag'], test_size=0.2, random_state=42)

# Создание конвейера с векторизатором и классификатором
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Обучение модели
model.fit(X_train, y_train)


#### Оценка эффективности обученной модели на тестовой выборке

In [14]:
# Создание прогнозов для тестового набора
y_pred = model.predict(X_test)

# Расчет точности
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.80805


In [15]:
# Генерация отчета о классификации
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")

Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.74      0.68      3056
     neutral       0.92      0.52      0.66      4954
    positive       0.83      0.95      0.88     11990

    accuracy                           0.81     20000
   macro avg       0.79      0.73      0.74     20000
weighted avg       0.82      0.81      0.80     20000


In [17]:
# Генерация матрицы ошибок
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[ 2265    63   728]
 [  828  2555  1571]
 [  475   174 11341]]


In [None]:
##### 