In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import pandas as pd

In [3]:
df = pd.read_csv('data.csv')

In [4]:
# Загрузка списка стоп-слов и инициализация лемматизатора
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Функция для предварительной обработки текста
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

In [6]:
!pip install nltk.downloader 'omw-1.4'

ERROR: Invalid requirement: "'omw-1.4'"


In [7]:
# Применение предварительной обработки к текстам
tqdm.pandas()  # Используем tqdm для отслеживания прогресса
df['processed_text'] = df['summary'] + ' ' + df['genre']
df['processed_text'] = df['processed_text'].progress_apply(preprocess_text)

100%|██████████| 4657/4657 [00:11<00:00, 397.88it/s]


In [8]:
# Создание TF-IDF матрицы
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

In [9]:
# Обучение модели Nearest Neighbors
model = NearestNeighbors(metric='cosine')
model.fit(tfidf_matrix)

In [10]:
def recommend_book(summary, genre, num_books=5):
    genres = []
    input_text = summary + ' ' + genre
    input_vector = tfidf_vectorizer.transform([preprocess_text(input_text)])

    # Поиск ближайших соседей для введенных данных
    distances, indices = model.kneighbors(input_vector, n_neighbors=num_books+1)

    # Вывод рекомендованных книг на основе близости к введенным данным
    for idx in indices.flatten()[1:]:
        genres.append(f"Book: {df['title'][idx]}")

    return genres

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [40]:
# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['genre'], test_size=0.2, random_state=42)

# Обучение модели Nearest Neighbors
model.fit(X_train, y_train)

# Получение ближайших соседей для тестового набора
neighbors = model.kneighbors(X_test, n_neighbors=1, return_distance=False)

# Прогнозирование меток на основе ближайших соседей
y_pred = [y_train.iloc[neighbor[0]] for neighbor in neighbors]

# Вычисление accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Вычисление f1
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)

Accuracy: 0.5246781115879828
F1 score: 0.5279303693390376
