In [1]:
import numpy as np
import pandas as pd 


import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

**Загрузим данные**

In [2]:
books = pd.read_csv('dataset/books.csv', encoding = "ISO-8859-1")
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [3]:
books.shape

(10000, 23)

In [4]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [5]:
ratings = pd.read_csv('dataset/ratings.csv', encoding = "ISO-8859-1")
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
book_tags = pd.read_csv('dataset/book_tags.csv', encoding = "ISO-8859-1")
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [7]:
tags = pd.read_csv('dataset/tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [8]:
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [9]:
tags_join_DF.sort_values('goodreads_book_id')

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
258362,1,33165,338,youth
250046,1,17213,347,kindle
246919,1,27535,348,shelfari-favorites
245888,1,16799,351,juvenile
...,...,...,...,...
918363,33288638,2541,9,angsty
956326,33288638,1126,7,5-star-reads
358501,33288638,18680,11,loved
954279,33288638,29125,10,sweet-romance


In [10]:
to_read = pd.read_csv('dataset/to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


Функция **TfidfVectorizer** из scikit-learn, которая преобразует **текст в векторы признаков**, которые можно использовать в качестве входных данных для модели.

  **Косинусное сходство** для вычисления числового значения, обозначающего сходство между двумя книгами.

In [11]:
books['authors']

0                    Suzanne Collins
1       J.K. Rowling, Mary GrandPrÃ©
2                    Stephenie Meyer
3                         Harper Lee
4                F. Scott Fitzgerald
                    ...             
9995                   Ilona Andrews
9996                  Robert A. Caro
9997                 Patrick O'Brian
9998                 Peggy Orenstein
9999                     John Keegan
Name: authors, Length: 10000, dtype: object

In [12]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])#(books['title'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
tfidf_matrix

<10000x14742 sparse matrix of type '<class 'numpy.float64'>'
	with 43235 stored elements in Compressed Sparse Row format>

In [14]:
for item in tfidf_matrix:
    print(item)
    break

  (0, 13166)	0.6040560087272662
  (0, 2647)	0.5635229978982601
  (0, 13165)	0.5635229978982601


Функция, которая возвращает 20 наиболее похожих книг на основе оценки косинусного сходства.

In [15]:
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Функция, возвращающая рекомендацию книг, основанную на cos sim авторов книг
def authors_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [16]:
authors_recommendations('The Hobbit').head(20)

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
465                             The Hobbit: Graphic Novel
0                 The Hunger Games (The Hunger Games, #1)
1       Harry Potter and the Sorcerer's Stone (Harry P...
2                                 Twilight (Twilight, #1)
3                                   To Kill a Mockingbird
4                                        The Great Gatsby
5                                  The Fault in Our Stars
7             

Будем рекомендовать книги, используя теги, предоставленные для книг.

In [17]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')

In [18]:
books_with_tags

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,tag_id,count,tag_name
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,30574,11314,to-read
1,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11305,10836,fantasy
2,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11557,50755,favorites
3,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,8717,35418,currently-reading
4,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,33114,25968,young-adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999907,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,16529,4,john-keegan
999908,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,32805,3,world-war
999909,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,32156,3,war-ww1
999910,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,20285,3,modern-european-history


In [19]:
tf1 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [20]:
cosine_sim1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [21]:

titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [22]:
tags_recommendations('The Hobbit').head(20)

16                  Catching Fire (The Hunger Games, #2)
31                                       Of Mice and Men
107         Confessions of a Shopaholic (Shopaholic, #1)
125                            Dune (Dune Chronicles #1)
149                                         The Red Tent
206               One for the Money (Stephanie Plum, #1)
214                                     Ready Player One
231                  The Gunslinger (The Dark Tower, #1)
253               Shiver (The Wolves of Mercy Falls, #1)
313                              Inkheart (Inkworld, #1)
325                                       White Oleander
405       The New Drawing on the Right Side of the Brain
412                                 The Three Musketeers
425                              A Confederacy of Dunces
505                          The One (The Selection, #3)
513                    The Adventures of Sherlock Holmes
525                  Darkly Dreaming Dexter (Dexter, #1)
566                            

Рекомендация книг с использованием авторов и тегов одновременно.

In [23]:
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [24]:
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')

In [25]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...


In [26]:
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))

books['corpus'].head()

0    Suzanne Collins to-read fantasy favorites curr...
1    J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2    Stephenie Meyer to-read fantasy favorites curr...
3    Harper Lee to-read favorites currently-reading...
4    F. Scott Fitzgerald to-read favorites currentl...
Name: corpus, dtype: object

In [27]:
tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

titles = books['title']
indices = pd.Series(books.index, index=books['title'])

def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

corpus_recommendations("The Hobbit")

188     The Lord of the Rings (The Lord of the Rings, ...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
18      The Fellowship of the Ring (The Lord of the Ri...
610              The Silmarillion (Middle-Earth Universe)
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
465                             The Hobbit: Graphic Novel
8271                   The Complete Guide to Middle-Earth
1366    The Once and Future King (The Once and Future ...
1321              The Last Unicorn (The Last Unicorn, #1)
53      The Hitchhiker's Guide to the Galaxy (Hitchhik...
367             The Subtle Knife (His Dark Materials, #2)
61            The Golden Compass (His Dark Materials, #1)
479           The Amber Spyglass (His Dark Materials, #3)
936          His Dark Materials (His Dark Materials #1-3)
331      The M

In [28]:
corpus_recommendations("Twilight (Twilight, #1)")

51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
991                    The Twilight Saga (Twilight, #1-4)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
1618    The Twilight Saga Complete Collection  (Twilig...
4087    The Twilight Saga: The Official Illustrated Gu...
2020             The Twilight Collection (Twilight, #1-3)
72                                The Host (The Host, #1)
219     Twilight: The Complete Illustrated Movie Compa...
55                           Breaking Dawn (Twilight, #4)
3074    Twilight: The Graphic Novel, Vol. 1 (Twilight:...
1802    The Awakening / The Struggle (The Vampire Diar...
2393    The Fury / Dark Reunion (The Vampire Diaries, ...
418                   Blood Promise (Vampire Academy, #4)
383                       Frostbite (Vampire Academy, #2)
4820    Jessica's Guide to Dating on the Dark Side (Je...
1108    New Mo

In [29]:
corpus_recommendations("Romeo and Juliet")

352                                           Othello
769                                     Julius Caesar
124                                            Hamlet
153                                           Macbeth
247                         A Midsummer Night's Dream
838                            The Merchant of Venice
854                                     Twelfth Night
529                            Much Ado About Nothing
713                                         King Lear
772                           The Taming of the Shrew
3947                                          Henry V
3699                                      Richard III
7073    King Henry IV, Part 1 (Wars of the Roses, #2)
386                                      The Crucible
1665                            Shakespeare's Sonnets
7                              The Catcher in the Rye
714                               Death of a Salesman
6529                                 Titus Andronicus
27                          

In [30]:
ratings

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
...,...,...,...
981751,10000,48386,5
981752,10000,49007,4
981753,10000,49383,5
981754,10000,50124,5


In [31]:
books

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,corpus
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...,Suzanne Collins to-read fantasy favorites curr...
1,2,3,3,4640799,491,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...,"J.K. Rowling, Mary GrandPrÃ© to-read fantasy f..."
2,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...,Harper Lee to-read favorites currently-reading...
4,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...,F. Scott Fitzgerald to-read favorites currentl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9.780441e+12,Ilona Andrews,2010.0,Bayou Moon,...,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...,to-read fantasy favorites currently-reading fi...,Ilona Andrews to-read fantasy favorites curren...
9996,9997,208324,208324,1084709,19,067973371X,9.780680e+12,Robert A. Caro,1990.0,Means of Ascent,...,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,to-read favorites currently-reading books-i-ow...,Robert A. Caro to-read favorites currently-rea...
9997,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,to-read favorites currently-reading fiction bo...,Patrick O'Brian to-read favorites currently-re...
9998,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...,to-read favorites currently-reading books-i-ow...,Peggy Orenstein to-read favorites currently-re...


Напоминание про ДЗ 1


### ДЗ 2. Content-based рекомендация.  Гибкий дедлайн 11 октября. Жесткий дедлайн 18 октября (оценка - 1 балл)

1. Приведите данные датасета ratings к виду датафрейма со строками-пользователями, столбцами-книгами и рейтингами на пересечении

user_vectors = #TO DO

(проверка: размерность датасета должна быть (53424, 10000) )

2. Создайте векторы, характеризующие пользоваталей (для content-based лучше это делать по дополнительным данным о пользователях, но так как тут у нас их нет - сделаем вектора-признаки (размерностью 100) на основе оценок) 

Делать будем с помощью метода понижения размерности PCA до 100.

3. Объедините три датасета:  
    * ratings
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги
    
В итоге у вас должен получиться датафрейм с вектором пользователя, вектором книги и таргет-рейтинг.

4. Разбейте на train/valid set

5. Обучите любую модель машинного обучения (для задачи регрессии (линейную или ансамбль деревьев)), сделайте прогноз и посчитайте метрики RMSE, MSE
6. Добавьте другие признаки по книгам из books

7. Сделайте тестовый датасет для пользователей и книг, которые находятся в датасете to-read:
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги
8. Сделайте для них прогноз

1. Приведите данные датасета ratings к виду датафрейма со строками-пользователями, столбцами-книгами и рейтингами на пересечении

In [32]:
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [33]:
np.unique(ratings.rating)

array([1, 2, 3, 4, 5])

In [34]:
user_vectors = ratings.pivot_table(values='rating', index='user_id', columns=['book_id']).fillna(0)

(проверка: размерность датасета должна быть (53424, 10000) )

In [35]:
assert user_vectors.shape == (53424, 10000)

2. Создайте векторы, характеризующие пользоваталей (для content-based лучше это делать по дополнительным данным о пользователях, но так как тут у нас их нет - сделаем вектора-признаки (размерностью 100) на основе оценок) 

Делать будем с помощью метода понижения размерности PCA до 100.

In [36]:
from sklearn.decomposition import PCA

pca = PCA(100)
pca_user_vectors = pca.fit_transform(user_vectors)

In [37]:
pca_user_vectors.shape

(53424, 100)

3. Объедините три датасета:  
    * ratings
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги
    
В итоге у вас должен получиться датафрейм с вектором пользователя, вектором книги и таргет-рейтинг.

In [96]:
df_pca_user_vectors = pd.DataFrame(pca_user_vectors)

In [97]:
tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english', max_features=3000)
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])

In [98]:
df_tfidf_matrix_corpus = pd.DataFrame(tfidf_matrix_corpus.toarray(), index=books['book_id'].values)

In [99]:
df_tfidf_matrix_corpus['book_id'] = df_tfidf_matrix_corpus.index
df_pca_user_vectors['user_id'] = df_pca_user_vectors.index

In [100]:
book_ratings = pd.merge(
    ratings, 
    df_tfidf_matrix_corpus, 
    on="book_id"
    )

In [101]:
dataset = pd.merge(
    book_ratings,
    df_pca_user_vectors,
    on='user_id'
)

In [102]:
target = dataset['rating']
dataset_bookid = dataset.drop(['user_id'], axis=1)
dataset.drop(['book_id', 'user_id', 'rating'], axis=1, inplace=True)

In [103]:
dataset.shape, target.shape

((79700, 3100), (79700,))

4. Разбейте на train/valid set

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [47]:
import warnings
warnings.simplefilter('ignore')

In [48]:
X_train, X_val, y_train, y_val = train_test_split(dataset, target, test_size=0.3, random_state=42)

In [49]:
X_train.shape

(55790, 3100)

5. Обучите любую модель машинного обучения (для задачи регрессии (линейную или ансамбль деревьев)), сделайте прогноз и посчитайте метрики RMSE, MSE

In [50]:
from sklearn.linear_model import ElasticNet

parameters = {"max_iter": [1, 5, 10],
                      "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                      "l1_ratio": np.arange(0.0, 1.0, 0.1)}
eNet = ElasticNet()

In [51]:
grid = GridSearchCV(eNet, parameters, scoring='neg_root_mean_squared_error', cv=5, verbose=1)
grid.fit(X_train.values, y_train.values)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


In [52]:
grid.best_params_

{'alpha': 0.0001, 'l1_ratio': 0.0, 'max_iter': 10}

In [53]:
print(f"MSE:\t%.4f" % mean_squared_error(grid.predict(X_val), y_val, squared=True))
print(f"RMSE:\t%.4f" % mean_squared_error(grid.predict(X_val), y_val, squared=False))

MSE:	0.9002
RMSE:	0.9488


6. Добавьте другие признаки по книгам из books

In [55]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url', 'tag_name', 'corpus'],
      dtype='object')

In [57]:
books[['book_id', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5']]

Unnamed: 0,book_id,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,2767052,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317
1,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543
2,41865,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439
3,2657,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267
4,4671,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718
...,...,...,...,...,...,...,...,...,...,...
9995,7130616,4.09,17204,18856,1180,105,575,3538,7860,6778
9996,208324,4.25,12582,12952,395,303,551,1737,3389,6972
9997,77431,4.35,9421,10733,374,11,111,1191,4240,5180
9998,8565083,3.65,11279,11994,1988,275,1002,3765,4577,2375


In [104]:
book_features = books[['book_id', 'average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5']]

In [105]:
book_features.head()

Unnamed: 0,book_id,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,2767052,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317
1,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543
2,41865,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439
3,2657,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267
4,4671,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718


In [106]:
dataset_bookid.head()

Unnamed: 0,book_id,rating,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293
1,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293
2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293
3,6,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293
4,29,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293


In [107]:
dataset_with_book_features = pd.merge(
    book_features,
    dataset_bookid,
    on='book_id'
)

In [108]:
dataset_with_book_features.head()

Unnamed: 0,book_id,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,...,-0.421241,0.237011,-0.167741,0.136411,0.475038,-0.19723,0.330447,-0.772861,0.110288,0.474293
1,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,...,0.385063,1.008911,-0.41497,2.115023,0.919517,-0.497703,2.455959,-0.310771,-1.068033,1.048186
2,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,...,-0.043937,0.005112,-0.000412,-0.005811,-0.030085,-0.006276,-0.036566,-0.01334,0.030327,-0.024631
3,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,...,0.189652,0.07066,0.1434,-0.388571,-0.012347,0.050889,-0.088461,-0.007984,0.086106,0.304519
4,3,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,...,0.079558,-0.126778,0.032393,-0.210305,0.042281,-0.00848,0.02446,0.122819,0.057945,-0.009429


In [109]:
dataset_with_book_features.shape

(79700, 3111)

In [110]:
train_target = dataset_with_book_features['rating']
train_data = dataset_with_book_features.drop(['book_id', 'rating'], axis=1)

In [113]:
from sklearn.preprocessing import MinMaxScaler

In [114]:
scaler = MinMaxScaler()

In [115]:
data = scaler.fit_transform(train_data)

In [118]:
X_train, X_val, y_train, y_val = train_test_split(data, train_target, test_size=0.3, random_state=42)

In [119]:
eNet = ElasticNet(alpha=0.0001, l1_ratio=0.0, max_iter=10)

In [120]:
eNet.fit(X_train, y_train)

In [121]:
print(f"MSE:\t%.4f" % mean_squared_error(eNet.predict(X_val), y_val, squared=True))
print(f"RMSE:\t%.4f" % mean_squared_error(eNet.predict(X_val), y_val, squared=False))

MSE:	0.8998
RMSE:	0.9486


7. Сделайте тестовый датасет для пользователей и книг, которые находятся в датасете to-read:
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги
8. Сделайте для них прогноз

In [125]:
to_read.head()

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


In [126]:
book_to_read = pd.merge(
    to_read,
    df_tfidf_matrix_corpus,
    on='book_id'
)

In [128]:
test_ds = pd.merge(
    book_to_read,
    df_pca_user_vectors,
    on='user_id'
)

In [130]:
test_ds.drop(['book_id', 'user_id'], axis=1, inplace=True)

In [131]:
result = grid.predict(test_ds)

In [132]:
result

array([3.95529603, 3.93817165, 3.7806352 , ..., 4.16473473, 4.20462114,
       4.21564284])

###

В 6 пункте можно получить rmse,mse еще ниже поигравшись с feature engineering 

Таже в целом в работе, большая часть гиперпараметров моделей не тюнилась (Кроме 5 пункта)