In [1]:
# Скачивание датасета
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2024-10-16 12:44:57--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-10-16 12:45:40 (1.90 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
# Unpack the downloaded archive into the current working directory
!tar -xf aclImdb_v1.tar.gz

In [3]:
# Sanity check: list the contents of the extracted dataset directory
!ls aclImdb

imdbEr.txt  imdb.vocab	README	test  train


In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Helper that reads the labelled reviews of one dataset split
def load_data(directory):
    """Read every review under ``directory/pos`` and ``directory/neg``.

    Args:
        directory: Path to a folder that contains ``pos`` and ``neg``
            subfolders holding one UTF-8 text file per review.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists. ``texts`` holds the
        file contents and ``labels`` holds 1 for files from ``pos`` and 0 for
        files from ``neg``. All positive reviews come first, then all negative
        ones; within each class the files are ordered by file name.

    Raises:
        FileNotFoundError: If ``directory/pos`` or ``directory/neg`` is missing.
    """
    texts = []
    labels = []

    # One loop for both classes instead of two copy-pasted ones; positives are
    # still read first, so the class order of the result is unchanged.
    for subdir, label in (('pos', 1), ('neg', 0)):
        class_dir = os.path.join(directory, subdir)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and everything derived from it) reproducible.
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            # Skip anything that is not a regular file (for example a
            # ".ipynb_checkpoints" folder), which open() could not read.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)

    return texts, labels

In [6]:
# Load the raw review texts and their 0/1 labels for both splits.
# The paths are relative to the working directory where the archive was unpacked.
train_dir = 'aclImdb/train'
test_dir = 'aclImdb/test'

X_train_texts, y_train = load_data(train_dir)
X_test_texts, y_test = load_data(test_dir)

In [7]:
# Turn the texts into TF-IDF features: unigrams and bigrams, English stop words
# removed, vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

# Fit the vectorizer on the training texts only; the test texts are transformed
# with that same fitted vectorizer.
X_train = tfidf.fit_transform(X_train_texts)
X_test = tfidf.transform(X_test_texts)

In [8]:
# Train a logistic regression classifier on the TF-IDF features
# (the solver is allowed up to 1000 iterations).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict labels for the test split
y_pred = model.predict(X_test)

In [9]:
# Evaluate the model: accuracy on the test split, printed as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy * 100:.2f}%")

Точность модели: 87.88%


In [10]:
# Per-class precision / recall / F1 report on the test split.
# Display names are given in label order: 0 (negative) first, then 1 (positive),
# matching the labels assigned in load_data.
print(classification_report(y_test, y_pred, target_names=["Отрицательные", "Положительные"]))

               precision    recall  f1-score   support

Отрицательные       0.88      0.87      0.88     12500
Положительные       0.88      0.88      0.88     12500

     accuracy                           0.88     25000
    macro avg       0.88      0.88      0.88     25000
 weighted avg       0.88      0.88      0.88     25000



In [11]:
import joblib

# Persist the trained classifier to disk
joblib.dump(model, 'logistic_model.pkl')

['logistic_model.pkl']

In [18]:
import joblib

# Persist the TF-IDF vectorizer that was fitted in the feature-extraction cell.
# That is the object whose vocabulary and IDF weights produced the features the
# classifier was trained on, so it is saved as-is instead of building and
# refitting a second vectorizer (which would redo the fit and shadow `tfidf`).
joblib.dump(tfidf, 'tfidf.pkl')


['tfidf.pkl']