In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Directory that holds the raw CSV files. Kept as a plain string so any other
# code that formats it into a path keeps working.
drive_path = './'

# Join paths with pathlib rather than string concatenation: because
# `drive_path` ends with a slash, f'{drive_path}/ratings.csv' expanded to
# './/ratings.csv'.
data_dir = Path(drive_path)

# Load the four raw tables.
ratings_df = pd.read_csv(data_dir / 'ratings.csv')
books_df = pd.read_csv(data_dir / 'books.csv')
tags_df = pd.read_csv(data_dir / 'tags.csv')
book_tags_df = pd.read_csv(data_dir / 'book_tags.csv')

# Report banner: an 80-character rule above and below the title.
rule = "=" * 80
print(rule)
print("ДИАГНОСТИКА ИСХОДНЫХ ДАННЫХ")
print(rule)

# [1] ratings_df: size, schema, dtypes, missing values, repeated
# (user_id, book_id) pairs, id cardinality and value ranges.
print("\n[1] RATINGS_DF")
print(f"Строк: {len(ratings_df):,}")
print(f"Колонки: {ratings_df.columns.tolist()}")
print(f"Типы: {ratings_df.dtypes.to_dict()}")

missing_by_column = ratings_df.isna().sum()
print(f"Пропуски:\n{missing_by_column}")

# Rows flagged here repeat a (user_id, book_id) pair that was already seen.
repeated_pairs = ratings_df.duplicated(subset=['user_id', 'book_id']).sum()
print(f"Дубликаты (user_id, book_id): {repeated_pairs}")

for column in ('user_id', 'book_id'):
    print(f"Уникальных {column}: {ratings_df[column].nunique()}")

for column in ('book_id', 'rating'):
    values = ratings_df[column]
    print(f"Диапазон {column}: {values.min()} - {values.max()}")

# A rating with a non-zero remainder modulo 1 is not a whole number.
fractional_ratings = ratings_df['rating'].mod(1).ne(0).sum()
print(f"Дробные рейтинги: {fractional_ratings}")

rating_histogram = ratings_df['rating'].value_counts().sort_index()
print(f"Распределение rating:\n{rating_histogram}")

# [2] books_df: size, schema, missing values, cardinality and range of both
# id columns, and books whose original publication year is negative.
print("\n[2] BOOKS_DF")
print(f"Строк: {len(books_df):,}")
print(f"Колонки: {books_df.columns.tolist()}")

books_missing = books_df.isna().sum()
print(f"Пропуски:\n{books_missing}")

for id_column in ('id', 'book_id'):
    print(f"Уникальных {id_column}: {books_df[id_column].nunique()}")

for id_column in ('id', 'book_id'):
    id_values = books_df[id_column]
    print(f"Диапазон {id_column}: {id_values.min()} - {id_values.max()}")

# Build the boolean mask once and reuse it for both the count and the listing.
negative_year_mask = books_df['original_publication_year'] < 0
negative_year_count = negative_year_mask.sum()
print(f"Годы < 0: {negative_year_count}")
if negative_year_count > 0:
    print("Примеры книг с годом < 0:")
    preview_columns = ['id', 'title', 'authors', 'original_publication_year']
    print(books_df.loc[negative_year_mask, preview_columns])

# [3] tags_df: size, schema, missing values and id/name cardinality.
print("\n[3] TAGS_DF")
print(f"Строк: {len(tags_df):,}")
print(f"Колонки: {tags_df.columns.tolist()}")
print(f"Пропуски:\n{tags_df.isna().sum()}")
print(f"Уникальных tag_id: {tags_df['tag_id'].nunique()}")
print(f"Уникальных tag_name: {tags_df['tag_name'].nunique()}")

# Tag frequency has to come from book_tags_df. Counting `tag_name` inside
# tags_df only measures how often a name is repeated in the tag table itself
# (presumably once per tag, since it looks like a tag_id -> tag_name lookup;
# verify against the cardinality printed above), which makes the "top 10"
# arbitrary. Instead, sum the per-book `count` of every tag and attach the
# readable name. Raw `count` values are summed as-is.
top_tags = (
    book_tags_df
    .groupby('tag_id', as_index=False)['count'].sum()
    .nlargest(10, 'count')
    .merge(tags_df[['tag_id', 'tag_name']], on='tag_id', how='left')
    .set_index('tag_name')['count']
)
print(f"Топ-10 тегов по частоте:\n{top_tags}")

# [4] book_tags_df: size, schema, missing values, key cardinality, range of
# goodreads_book_id and the number of negative `count` values.
print("\n[4] BOOK_TAGS_DF")
print(f"Строк: {len(book_tags_df):,}")
print(f"Колонки: {book_tags_df.columns.tolist()}")

book_tags_missing = book_tags_df.isna().sum()
print(f"Пропуски:\n{book_tags_missing}")

goodreads_ids = book_tags_df['goodreads_book_id']
print(f"Уникальных goodreads_book_id: {goodreads_ids.nunique()}")
print(f"Уникальных tag_id: {book_tags_df['tag_id'].nunique()}")
print(f"Диапазон goodreads_book_id: {goodreads_ids.min()} - {goodreads_ids.max()}")

negative_counts = book_tags_df['count'].lt(0).sum()
print(f"Отрицательные count: {negative_counts}")

# Critical checks: how many distinct key values each pair of columns shares.
# Both candidate book keys (`id` and `book_id`) are tried for every link.
def _shared_keys(left, right):
    """Return the number of distinct values that occur in both `left` and `right`."""
    return len(set(left) & set(right))


print("\n[5] СВЯЗИ МЕЖДУ ДАТАСЕТАМИ")
print(f"ratings['book_id'] ∩ books['id']: {_shared_keys(ratings_df['book_id'], books_df['id'])}")
print(f"ratings['book_id'] ∩ books['book_id']: {_shared_keys(ratings_df['book_id'], books_df['book_id'])}")
print(f"book_tags['goodreads_book_id'] ∩ books['id']: {_shared_keys(book_tags_df['goodreads_book_id'], books_df['id'])}")
print(f"book_tags['goodreads_book_id'] ∩ books['book_id']: {_shared_keys(book_tags_df['goodreads_book_id'], books_df['book_id'])}")
print(f"book_tags['tag_id'] ∩ tags['tag_id']: {_shared_keys(book_tags_df['tag_id'], tags_df['tag_id'])}")

# Closing banner.
print(f"\n{'=' * 80}")
print("ДИАГНОСТИКА ЗАВЕРШЕНА")
print("=" * 80)


ДИАГНОСТИКА ИСХОДНЫХ ДАННЫХ

[1] RATINGS_DF
Строк: 981,756
Колонки: ['book_id', 'user_id', 'rating']
Типы: {'book_id': dtype('int64'), 'user_id': dtype('int64'), 'rating': dtype('int64')}
Пропуски:
book_id    0
user_id    0
rating     0
dtype: int64
Дубликаты (user_id, book_id): 2278
Уникальных user_id: 53424
Уникальных book_id: 10000
Диапазон book_id: 1 - 10000
Диапазон rating: 1 - 5
Дробные рейтинги: 0
Распределение rating:
rating
1     19575
2     63231
3    248623
4    357366
5    292961
Name: count, dtype: int64

[2] BOOKS_DF
Строк: 10,000
Колонки: ['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year', 'original_title', 'title', 'language_code', 'average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url', 'small_image_url']
Пропуски:
id                              0
book_id                         0
best_book_id       