# Importing all necessary packages

In [1]:
# Data Processing
import pandas as pd
from scipy.sparse import csr_matrix

# Importing relevant data

"items_info.dat" contains data field seperated by "\t" but some of the fields also contain "\t" within the text of the file. This causes errors when the file is read directly. The next cell is trying to sort that error.

In [21]:
# This code block creates a new .dat file that keeps only the first 6 columns and the
# last column of every record (the remaining columns are not necessary for the project).
# Records with fewer than 7 tab-separated fields are skipped.
with open('book_crossing/book_crossing/items_info.dat', 'r', encoding='utf-8') as infile, \
     open('book_crossing/book_crossing/items_info_clean.dat', 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Remove only the line terminator. A bare strip() would also remove leading and
        # trailing tabs, which drops empty first/last fields and shifts the columns.
        parts = line.rstrip('\r\n').split('\t')
        # At least 7 fields are required so that the last field is distinct from the
        # first 6; with exactly 6 fields parts[-1] would just repeat the 6th field.
        if len(parts) >= 7:
            selected_parts = parts[:6] + [parts[-1]]  # First 6 fields + last field
            outfile.write('\t'.join(selected_parts) + '\n')

In [22]:
# Load the four tab-separated tables used by the rest of the notebook.
ratings = pd.read_csv('book_crossing/book_crossing/book_ratings.dat', sep='\t')  # user/item ratings
history = pd.read_csv('book_crossing/book_crossing/book_history.dat', sep='\t')  # books and readers history
# List of books and IDs (primary ID key), read from the cleaned copy of the items file.
# Rows that still fail to parse are skipped instead of raising.
items = pd.read_csv(
    'book_crossing/book_crossing/items_info_clean.dat',
    sep='\t',
    on_bad_lines='skip',
)
users = pd.read_csv('book_crossing/book_crossing/users_info.dat', sep='\t')  # list of users

# Analysis

## Analysis of ratings dataset

In [23]:
# Distinct counts taken from the ratings table (missing values are not counted).
n_readers = ratings['user'].nunique(dropna=True)  # number of different users who rated something
n_books = ratings['item'].nunique(dropna=True)    # number of different books that were rated

## Book Average ratings

In [14]:
# Mean rating of every book, as a Series indexed by item ID.
avg_ratings = ratings['rating'].groupby(ratings['item']).mean()
# Plain lookup table: item ID -> mean rating rounded to 2 decimals.
avg_ratings_dict = {
    item_id: mean_rating
    for item_id, mean_rating in avg_ratings.round(2).items()
}

## Book titles

In [26]:
# Lookup table: Book_ID -> Book-Title. If a Book_ID occurs more than once, the last title wins.
book_titles = {
    book_id: title
    for book_id, title in zip(items['Book_ID'], items['Book-Title'])
}

In [29]:
# What percentage of the accessed books were rated?
# NOTE(review): this compares row counts, so it assumes every row in `ratings`
# corresponds to a row in `history` - confirm against the dataset description.
percent_rated = len(ratings) / len(history) * 100
print(f'{round(percent_rated, 2)}% of the books accessed were rated.')

# Which user accessed the most books? (sorted descending, so position 0 is the top user)
user_access = history.groupby('user')['item'].count().sort_values(ascending=False)
print(f'The user who accessed the most books is user {user_access.index[0]} with {user_access.max()} books accessed.')

# Which user rated the most books?
user_ratings = ratings.groupby('user')['item'].count().sort_values(ascending=False)
print(f'The user who rated the most books is user {user_ratings.index[0]} with {user_ratings.max()} books rated.')

# Which book was accessed the most? value_counts() sorts descending by default.
book_access = history.value_counts('item')
most_accessed_id = book_access.index[0]
# book_titles comes from the cleaned items file, which can be missing rows, so an ID
# seen in the history may have no title. Fall back to a label instead of a KeyError.
most_accessed_title = book_titles.get(most_accessed_id, f'<unknown title, id {most_accessed_id}>')
print(f'The most accessed book was {most_accessed_title}, accessed {book_access.max()} times.')

# Which book was rated the most?
book_ratings = ratings.value_counts('item')
most_rated_id = book_ratings.index[0]
# Same fallback as above for IDs that have no entry in book_titles.
most_rated_title = book_titles.get(most_rated_id, f'<unknown title, id {most_rated_id}>')
print(f'The most rated book was {most_rated_title}, rated {book_ratings.max()} times.')

22.98% of the books accessed were rated.
The user who accessed the most books is user 1614 with 2088 books accessed.
The user who rated the most books is user 1003 with 1092 books rated.
The most accessed book was Airframe, accessed 718 times.
The most rated book was Impossible Vacation, rated 160 times.


# Creating Sparse Matrix