<a href="https://colab.research.google.com/github/CodeHunterOfficial/ABC_DataMining/blob/main/NM/14-02-2025-Recomended.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Установка необходимых библиотек
!pip install transformers datasets sentence-transformers scikit-learn gensim bert-score umap-learn matplotlib faiss-cpu annoy pot streamlit gensim



In [9]:
!pip install --upgrade transformers torch

Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cusparselt-cu12==0.6.2 (from torch)
  Downloading nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting triton==3.2.0 (from torch)
  Downloading triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cusparse

In [11]:
# Импорт библиотек

# Gensim
from gensim.models import KeyedVectors, Word2Vec
from gensim.similarities import WmdSimilarity
import gensim.downloader as api

# Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel

# Datasets
from datasets import load_dataset

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from scipy.spatial.distance import cdist

# BERT-Score
from bert_score import score

# Dimensionality Reduction
import umap

# Indexing and Similarity Search
import faiss
from annoy import AnnoyIndex

# PyTorch
import torch

# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Logging and Serialization
import logging
import pickle

# Streamlit for Web Applications
import streamlit as st

# Unit Testing
import unittest

# Профилирование
import cProfile

import os
import subprocess

# Logging configuration: INFO and above go to recommendations.log
logging.basicConfig(filename='recommendations.log', level=logging.INFO)

# Pretrained word vectors used later for Word Mover's Distance.
# NOTE: the variable is called word2vec_model, but the checkpoint is GloVe.
word2vec_model = api.load("glove-wiki-gigaword-100")

# Step 1: download the Amazon Reviews dataset (skipped when already unpacked)
logging.info("Загрузка датасета...")
url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
dataset_path = "reviews_Electronics_5.json"
archive_path = dataset_path + ".gz"
if not os.path.exists(dataset_path):
    # check=True raises CalledProcessError as soon as the download or the
    # decompression fails, instead of letting pd.read_json fail later on a
    # missing file. "-O" overwrites a stale/partial archive from an earlier run.
    subprocess.run(["wget", "-O", archive_path, url], check=True)
    subprocess.run(["gzip", "-d", archive_path], check=True)

data = pd.read_json(dataset_path, lines=True)

# Take the first 50 records to keep the test run fast
reviews = data['reviewText'].tolist()[:50]
product_titles = data['reviewerName'].tolist()[:50]
ratings = data['overall'].tolist()[:50]

# Preprocessing: a record is kept only when its review is a non-blank string,
# its title is a string and its rating is numeric. Filtering the three fields
# together keeps the lists aligned, so index i refers to the same record in
# filtered_reviews, filtered_titles and filtered_ratings.
valid_records = [
    (review, title, rating)
    for review, title, rating in zip(reviews, product_titles, ratings)
    if isinstance(review, str) and review.strip()
    and isinstance(title, str)
    and isinstance(rating, (int, float))
]
filtered_reviews = [review for review, _, _ in valid_records]
filtered_titles = [title for _, title, _ in valid_records]
filtered_ratings = [rating for _, _, rating in valid_records]

# Step 2: build embeddings with a Hugging Face model
logging.info("Генерация эмбеддингов...")
BERT_CHECKPOINT = "bert-base-uncased"
# The tokenizer is loaded first, then the encoder, both from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(BERT_CHECKPOINT),
    AutoModel.from_pretrained(BERT_CHECKPOINT),
)

def get_embeddings(texts, batch_size=32):
    """Encode texts into embeddings taken from the first ([CLS]) token position.

    Texts are tokenized with the module-level ``tokenizer`` (padded, truncated
    to 512 tokens) and passed through the module-level ``model`` with gradient
    tracking disabled. They are processed ``batch_size`` texts at a time so
    that a forward pass never has to hold the whole corpus at once.

    :param texts: A single string or a sequence of strings.
    :param batch_size: Number of texts per forward pass; must be positive.
    :return: NumPy array with one row per input text. For empty input the
        array has shape ``(0, model.config.hidden_size)``.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # A bare string would otherwise be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of the last hidden state is the [CLS] token.
        chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(chunks, axis=0)

embeddings = get_embeddings(filtered_reviews)

# Persist the embeddings to disk as a pickle
with open('embeddings.pkl', mode='wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

# Recommendation routine and its private ranking helpers
def _rank_excluding(scores, exclude_index, top_n, descending=True):
    """Return up to ``top_n`` indices ordered by ``scores``, never ``exclude_index``."""
    order = np.argsort(np.asarray(scores), kind="stable")
    if descending:
        order = order[::-1]
    return [int(i) for i in order if int(i) != exclude_index][:top_n]


def _min_max(values):
    """Rescale ``values`` linearly to [0, 1]; a constant vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    span = values.max() - values.min()
    if span == 0:
        return np.zeros_like(values)
    return (values - values.min()) / span


def get_recommendations(selected_index, top_n=5):
    """
    Print recommendations for one item using several similarity approaches.

    Reads the module-level ``embeddings``, ``filtered_reviews``,
    ``filtered_titles``, ``filtered_ratings`` and ``word2vec_model``. Every
    ranked list excludes the selected item explicitly instead of assuming it
    is always ranked first. Results are printed; a UMAP scatter plot is shown
    at the end. Nothing is returned.

    :param selected_index: Index of the selected item.
    :param top_n: Number of recommendations to print per approach.
    """
    print(f"Выбранный товар: {filtered_titles[selected_index]}")
    query = [embeddings[selected_index]]

    # Cosine similarity (computed once and reused by the combined scores below)
    similarity_matrix = cosine_similarity(query, embeddings)[0]
    print("\nРекомендации по косинусной схожести:")
    for idx in _rank_excluding(similarity_matrix, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Схожесть: {similarity_matrix[idx]:.2f})")

    # Euclidean distance (smaller is closer)
    distance_matrix_euclidean = cdist(query, embeddings, metric='euclidean')[0]
    print("\nРекомендации по Евклидову расстоянию:")
    for idx in _rank_excluding(distance_matrix_euclidean, selected_index, top_n, descending=False):
        print(f"- {filtered_titles[idx]} (Расстояние: {distance_matrix_euclidean[idx]:.2f})")

    # Manhattan distance (smaller is closer)
    distance_matrix_cityblock = cdist(query, embeddings, metric='cityblock')[0]
    print("\nРекомендации по Манхэттенскому расстоянию:")
    for idx in _rank_excluding(distance_matrix_cityblock, selected_index, top_n, descending=False):
        print(f"- {filtered_titles[idx]} (Расстояние: {distance_matrix_cityblock[idx]:.2f})")

    # K-Means clustering; the cluster count is capped by the number of samples
    kmeans = KMeans(n_clusters=min(5, len(embeddings)), random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    selected_cluster = clusters[selected_index]
    cluster_items = [i for i, c in enumerate(clusters) if c == selected_cluster and i != selected_index]
    print("\nТовары из того же кластера (K-Means):")
    for idx in cluster_items[:top_n]:
        print(f"- {filtered_titles[idx]}")

    # DBSCAN clustering; label -1 marks noise points
    # NOTE(review): eps=0.5 is in the raw distance units of the embeddings - confirm it suits them.
    dbscan = DBSCAN(eps=0.5, min_samples=2)
    clusters_dbscan = dbscan.fit_predict(embeddings)
    if clusters_dbscan[selected_index] != -1:
        selected_cluster_dbscan = clusters_dbscan[selected_index]
        cluster_items_dbscan = [i for i, c in enumerate(clusters_dbscan) if c == selected_cluster_dbscan and i != selected_index]
        print("\nТовары из того же кластера (DBSCAN):")
        for idx in cluster_items_dbscan[:top_n]:
            print(f"- {filtered_titles[idx]}")
    else:
        print("\nВыбранный товар является шумом (DBSCAN).")

    # Word Mover's Distance (WMD). Tokens are lower-cased because the GloVe
    # vocabulary loaded for word2vec_model is uncased.
    print("\nРекомендации по Word Mover's Distance:")
    query_tokens = filtered_reviews[selected_index].lower().split()
    wmd_distances = []
    for i, review in enumerate(filtered_reviews):
        if i == selected_index:
            continue  # skip the selected review itself
        distance = word2vec_model.wmdistance(query_tokens, review.lower().split())
        wmd_distances.append((i, distance))
    # Smaller distance means more similar
    wmd_distances.sort(key=lambda x: x[1])
    for idx, distance in wmd_distances[:top_n]:
        print(f"- {filtered_titles[idx]} (Расстояние: {distance:.2f})")

    # TF-IDF cosine similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(filtered_reviews)
    tfidf_similarity_matrix = cosine_similarity(tfidf_matrix[selected_index], tfidf_matrix)[0]
    print("\nРекомендации по TF-IDF:")
    for idx in _rank_excluding(tfidf_similarity_matrix, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Схожесть: {tfidf_similarity_matrix[idx]:.2f})")

    # BERTScore: computed once here and reused by the combined metric below
    _, _, F1 = score(
        [filtered_reviews[selected_index]] * len(filtered_reviews),
        filtered_reviews,
        lang="en",
        verbose=True
    )
    f1_scores = F1.numpy()
    print("\nРекомендации по BERTScore:")
    for idx in _rank_excluding(f1_scores, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Схожесть: {f1_scores[idx]:.2f})")

    # FAISS exact L2 search; one extra neighbour is requested because the
    # query item is part of the index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings).astype('float32'))
    query_embedding = np.array(query).astype('float32')
    faiss_distances, faiss_indices = index.search(query_embedding, top_n + 1)
    faiss_hits = [
        (int(idx), float(dist))
        for idx, dist in zip(faiss_indices[0], faiss_distances[0])
        if idx != -1 and idx != selected_index  # -1 pads missing neighbours
    ][:top_n]
    print("\nРекомендации по FAISS:")
    for rank, (idx, dist) in enumerate(faiss_hits, start=1):
        # IndexFlatL2 reports squared L2 distances; take the root to print a distance
        print(f"{rank}. {filtered_titles[idx]} (Расстояние: {np.sqrt(max(dist, 0.0)):.2f})")

    # Annoy approximate search with the angular metric
    dim = embeddings.shape[1]
    annoy_index = AnnoyIndex(dim, 'angular')
    for i, emb in enumerate(embeddings):
        annoy_index.add_item(i, emb)
    annoy_index.build(10)  # 10 trees
    annoy_ids, annoy_distances = annoy_index.get_nns_by_item(selected_index, top_n + 1, include_distances=True)
    annoy_hits = [(i, d) for i, d in zip(annoy_ids, annoy_distances) if i != selected_index][:top_n]
    print("\nРекомендации по Annoy:")
    for rank, (idx, dist) in enumerate(annoy_hits, start=1):
        # Annoy's angular distance is sqrt(2 * (1 - cos)), hence cos = 1 - d^2 / 2
        print(f"{rank}. {filtered_titles[idx]} (Схожесть: {1 - dist ** 2 / 2:.2f})")

    # Ensemble: each signal is rescaled to [0, 1] before averaging so that the
    # distance terms (much larger in magnitude) do not drown out the cosine term
    metrics = [
        _min_max(similarity_matrix),
        _min_max(-distance_matrix_euclidean),  # negated so that larger means closer
        _min_max(-distance_matrix_cityblock),
    ]
    combined_scores = np.mean(metrics, axis=0)
    print("\nРекомендации по ансамблевому подходу:")
    for idx in _rank_excluding(combined_scores, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Средняя схожесть: {combined_scores[idx]:.2f})")

    # Combined metric: embedding cosine similarity plus BERTScore F1
    combined_metric = similarity_matrix + f1_scores
    print("\nРекомендации по комбинированной метрике:")
    for idx in _rank_excluding(combined_metric, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Суммарная схожесть: {combined_metric[idx]:.2f})")

    # Multi-criteria score: cosine similarity plus the rating scaled by 1/5.
    # The selected item is not necessarily the top score here, which is why it
    # is excluded by index rather than by position.
    rating_scores = np.array(filtered_ratings) / 5.0
    combined_scores_multi = similarity_matrix + rating_scores
    print("\nРекомендации по мультикритериальному подходу:")
    for idx in _rank_excluding(combined_scores_multi, selected_index, top_n):
        print(f"- {filtered_titles[idx]} (Суммарная оценка: {combined_scores_multi[idx]:.2f}, Рейтинг: {filtered_ratings[idx]}/5)")

    # UMAP visualisation with the selected item highlighted in red
    reducer = umap.UMAP(random_state=42)
    embedding_2d = reducer.fit_transform(embeddings)
    plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1], s=5)
    plt.scatter(embedding_2d[selected_index, 0], embedding_2d[selected_index, 1], color='red', label='Выбранный товар')
    plt.legend()
    plt.title("Визуализация схожести (UMAP)")
    plt.show()

# Step 3: try out the recommender
selected_index = 10  # index of the item to analyse
get_recommendations(selected_index=selected_index, top_n=5)

# Recommendation quality evaluation
def evaluate_recommendations(true_indices, predicted_indices, k=5):
    """Compute binary-relevance NDCG@k for a ranked list of recommended items.

    ``predicted_indices`` is read as a ranking (best first) of item
    identifiers. An item contributes a gain of 1 when it appears in
    ``true_indices``; a repeated item is counted only at its first position.
    The result is DCG over the first ``k`` positions divided by the DCG of an
    ideal ranking that places ``min(len(true_indices), k)`` relevant items first.

    :param true_indices: Identifiers of the relevant items.
    :param predicted_indices: Recommended item identifiers, best first.
    :param k: Number of top positions to evaluate.
    :return: NDCG@k as a float in [0, 1]; 0.0 when there are no relevant
        items or no evaluated positions.
    """
    relevant = set(true_indices)
    ranked = list(predicted_indices)[:max(k, 0)]
    if not relevant or not ranked:
        return 0.0

    # Position p (0-based) is discounted by 1 / log2(p + 2).
    discounts = 1.0 / np.log2(np.arange(2, len(ranked) + 2))
    gains = np.zeros(len(ranked))
    already_counted = set()
    for position, item in enumerate(ranked):
        if item in relevant and item not in already_counted:
            gains[position] = 1.0
            already_counted.add(item)
    dcg = float(np.sum(gains * discounts))

    ideal_hits = min(len(relevant), k)
    idcg = float(np.sum(1.0 / np.log2(np.arange(2, ideal_hits + 2))))
    return dcg / idcg

# Example usage
true_indices = [1, 3, 5]  # example relevant indices
predicted_indices = [1, 2, 3, 4, 5]  # example predicted indices
# The result is stored as ndcg_value so that the ndcg_score function imported
# from sklearn.metrics is not overwritten by a float.
ndcg_value = evaluate_recommendations(true_indices, predicted_indices)
print(f"NDCG Score: {ndcg_value}")

# Hyperparameter tuning for K-Means
from sklearn.metrics import silhouette_score


def _silhouette_scorer(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of its predicted labels on ``X``."""
    labels = estimator.predict(X)
    n_labels = len(set(labels))
    # silhouette_score needs at least 2 labels and fewer labels than samples;
    # return the worst possible value instead of raising on a degenerate fold.
    if n_labels < 2 or n_labels >= len(X):
        return -1.0
    return silhouette_score(X, labels)


param_grid = {'n_clusters': [3, 5, 7, 10]}
kmeans = KMeans(random_state=42)
# Without an explicit scorer GridSearchCV falls back to KMeans.score (negative
# inertia), which is a poor criterion for choosing the number of clusters.
grid_search = GridSearchCV(kmeans, param_grid, cv=3, scoring=_silhouette_scorer)
grid_search.fit(embeddings)
best_kmeans = grid_search.best_estimator_
print(f"Лучшие параметры для K-Means: {grid_search.best_params_}")

# Outlier analysis
# random_state is fixed (42, as for the other estimators in this notebook) so
# that the outlier mask is the same on every run.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = iso_forest.fit_predict(embeddings)
# fit_predict labels inliers with 1; keep only those rows.
clean_embeddings = embeddings[outliers == 1]

# Explaining recommendations
def explain_recommendation(review_index, top_n=5):
    """Return the highest-weighted TF-IDF terms of one review.

    A TF-IDF model is fitted on the module-level ``filtered_reviews`` inside
    the function, so it does not depend on variables that only exist inside
    ``get_recommendations``.

    :param review_index: Index of the review in ``filtered_reviews``.
    :param top_n: Maximum number of terms to return.
    :return: List of terms ordered from highest to lowest TF-IDF weight;
        terms with zero weight in this review are left out.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(filtered_reviews)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix[review_index].toarray().flatten()
    top_indices = np.argsort(tfidf_scores)[::-1][:top_n]
    return [str(feature_names[i]) for i in top_indices if tfidf_scores[i] > 0]

# Example usage: key terms for the currently selected review
top_features = explain_recommendation(review_index=selected_index)
print(f"Ключевые слова для рекомендации: {top_features}")

# Cluster visualisation
# The cluster labels computed inside get_recommendations are local to that
# function, so the points are coloured with labels from the tuned K-Means model.
cluster_labels = best_kmeans.predict(embeddings)
reducer = umap.UMAP(random_state=42)
embedding_2d = reducer.fit_transform(embeddings)
plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=cluster_labels, cmap='Spectral', s=5)
plt.colorbar()
plt.title("Визуализация кластеров (UMAP)")
plt.show()

# Streamlit interface
import contextlib
import io

st.title("Рекомендательная система")
# index=selected_index makes the current selection the widget's default.
# NOTE(review): outside `streamlit run` widgets appear to return their default
# value, which previously reset selected_index to 0 for the cells below - confirm.
selected_index = st.selectbox(
    "Выберите товар",
    range(len(filtered_titles)),
    index=selected_index,
)
if st.button("Получить рекомендации"):
    # get_recommendations reports through print(); capture that output and
    # render it on the page instead of losing it to the server console.
    report = io.StringIO()
    with contextlib.redirect_stdout(report):
        get_recommendations(selected_index, top_n=5)
    st.text(report.getvalue())

# Logging: record which item the recommendations were produced for
log_message = f"Рекомендации для товара {filtered_titles[selected_index]}"
logging.info(log_message)

# Profiling: run the whole recommendation routine under cProfile
profiled_statement = 'get_recommendations(selected_index, top_n=5)'
cProfile.run(profiled_statement)

# Unit tests
class TestRecommendations(unittest.TestCase):
    """Sanity checks for the similarity primitive used by the recommender."""

    def test_cosine_similarity(self):
        """Orthogonal vectors score 0; [1, 0] against [1, 1] scores about 0.707."""
        vectors = np.array([[1, 0], [0, 1], [1, 1]])
        pairwise = cosine_similarity(vectors)
        orthogonal_pair, diagonal_pair = pairwise[0, 1], pairwise[0, 2]
        self.assertAlmostEqual(orthogonal_pair, 0.0)
        self.assertAlmostEqual(diagonal_pair, 0.707, places=3)

if __name__ == '__main__':
    # In a notebook sys.argv carries the kernel's own arguments and
    # unittest.main() ends by calling sys.exit(). A dummy argv plus exit=False
    # lets the tests run without stopping the kernel.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
cannot import name 'TensorifyScalarRestartAnalysis' from 'torch._dynamo.exc' (/usr/local/lib/python3.11/dist-packages/torch/_dynamo/exc.py)