In [382]:
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

In [383]:
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_colwidth', None)

In [384]:
books = pd.read_csv("BX-Books.csv")
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg


In [385]:
interactions = pd.read_csv("BX-Book-Ratings.csv", sep=";", encoding = "ISO-8859-1")
interactions = interactions[interactions["Book-Rating"] != 0]
interactions.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [386]:
books_meets = interactions.groupby("ISBN")["User-ID"].count().reset_index().rename(columns={"User-ID": "user_num"})
books_meets.head()

Unnamed: 0,ISBN,user_num
0,0330299891,1
1,0375404120,1
2,9022906116,1
3,#6612432,1
4,'9607092910',1


In [387]:
user_meets = interactions.groupby("User-ID")["ISBN"].count().reset_index().rename(columns={"ISBN": "books_num"})
user_meets.head()

Unnamed: 0,User-ID,books_num
0,8,7
1,9,1
2,10,1
3,12,1
4,14,3


In [388]:
interactions = interactions.merge(books_meets, on=["ISBN"]).merge(user_meets, on=["User-ID"])

In [389]:
interactions = interactions[(interactions["user_num"] > 5) & 
                            (interactions["books_num"] > 5) &
                            (interactions["books_num"] < 200)]
interactions.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,user_num,books_num
55,276822,60096195,10,53,15
59,276822,375821813,9,10,15
62,276822,552546933,9,17,15
66,276822,786817070,10,74,15
77,276847,3257200552,5,12,36


In [390]:
users = pd.read_csv('BX-Users.csv', delimiter=';', encoding = 'ISO-8859-1')
users.head(150)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
5,6,"santa monica, california, usa",61.0
6,7,"washington, dc, usa",
7,8,"timmins, ontario, canada",
8,9,"germantown, tennessee, usa",
9,10,"albacete, wisconsin, spain",26.0


# Персональный топ

In [391]:
users_with_age = users.dropna(subset=['Age'])

location_split = users_with_age['Location'].str.split(', ', expand=True)
users_with_age['City'] = location_split[0]
users_with_age['State'] = location_split[1]
users_with_age['Country'] = location_split[2]

users_with_age['Country'] = users_with_age['Country'].str.strip()

country_counts = users_with_age['Country'].value_counts()

# Топ 50 стран
top_countries = country_counts.head(50).index

users_filtered = users_with_age[users_with_age['Country'].isin(top_countries)]

interactions_filtered = interactions.merge(users_filtered, on='User-ID').merge(books, on='ISBN')

# Книги с хорошим рейтингом
interactions_filtered = interactions_filtered[interactions_filtered['Book-Rating'] > 5]

# Топ книг
def get_top_books(group):
    top_books = group.groupby(['Book-Title'])['Book-Rating'].mean().sort_values(ascending=False).head(10)
    return top_books

interactions_filtered['Age-Group'] = pd.cut(interactions_filtered['Age'], bins=[0, 18, 25, 55, 100], right=False)
top_books_by_country_age = interactions_filtered.groupby(['Country', 'Age-Group']).apply(get_top_books).reset_index()

top_books_by_country_age

Unnamed: 0,Country,Age-Group,Book-Title,Book-Rating
0,argentina,"[25, 55)",Cyrano De Bergerac (Bantam Classics),10.0
1,argentina,"[25, 55)",Interview with the Vampire,10.0
2,argentina,"[25, 55)",Othello (Bantam Classics),9.0
3,argentina,"[25, 55)",Alice's Adventures in Wonderland and Through the Looking Glass (Signet Classics (Paperback)),8.0
4,argentina,"[25, 55)",Hamlet (Bantam Classics),8.0
5,australia,"[0, 18)",The Blue Day Book: A Lesson in Cheering Yourself Up,10.0
6,australia,"[0, 18)",The Worst-Case Scenario Survival Handbook,10.0
7,australia,"[0, 18)",Memoirs of a Geisha Uk,10.0
8,australia,"[0, 18)",Life of Pi,10.0
9,australia,"[0, 18)",Girl with a Pearl Earring,10.0


In [392]:
interactions = interactions.merge(books[["ISBN", "Image-URL-M", "Book-Title"]].rename(
    columns={"Image-URL-M": "picture_url"}), on=["ISBN"])
interactions.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,user_num,books_num,picture_url,Book-Title
0,276822,60096195,10,53,15,http://images.amazon.com/images/P/0060096195.01.MZZZZZZZ.jpg,The Boy Next Door
1,276822,375821813,9,10,15,http://images.amazon.com/images/P/0375821813.01.MZZZZZZZ.jpg,Hoot (Newbery Honor Book)
2,276822,786817070,10,74,15,http://images.amazon.com/images/P/0786817070.01.MZZZZZZZ.jpg,"Artemis Fowl (Artemis Fowl, Book 1)"
3,276847,3404148576,8,10,36,http://images.amazon.com/images/P/3404148576.01.MZZZZZZZ.jpg,Nordermoor
4,276847,3423071516,10,13,36,http://images.amazon.com/images/P/3423071516.01.MZZZZZZZ.jpg,Der Kleine Hobbit


In [393]:
from sklearn import preprocessing   
le = preprocessing.LabelEncoder()

In [394]:
interactions["book_id"] = le.fit_transform(interactions["ISBN"])
interactions["vid"] = le.fit_transform(interactions["User-ID"])

In [395]:
interactions.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,user_num,books_num,picture_url,Book-Title,book_id,vid
0,276822,60096195,10,53,15,http://images.amazon.com/images/P/0060096195.01.MZZZZZZZ.jpg,The Boy Next Door,98,10883
1,276822,375821813,9,10,15,http://images.amazon.com/images/P/0375821813.01.MZZZZZZZ.jpg,Hoot (Newbery Honor Book),2737,10883
2,276822,786817070,10,74,15,http://images.amazon.com/images/P/0786817070.01.MZZZZZZZ.jpg,"Artemis Fowl (Artemis Fowl, Book 1)",8884,10883
3,276847,3404148576,8,10,36,http://images.amazon.com/images/P/3404148576.01.MZZZZZZZ.jpg,Nordermoor,10402,10884
4,276847,3423071516,10,13,36,http://images.amazon.com/images/P/3423071516.01.MZZZZZZZ.jpg,Der Kleine Hobbit,10409,10884


In [396]:
csr_rates = coo_matrix((interactions["Book-Rating"], (interactions["vid"], interactions["book_id"])), 
                            shape=(len(set(interactions["vid"])), len(set(interactions["book_id"]))))

# Совстречаемость

In [397]:
import tqdm
import pandas as pd

class Recommendations():
    def __init__(self, Int):
        # Создаём словарь для того, чтобы по book_id возвращать url товара
        self.book_id_to_url = {}
        for i in Int[["ISBN", "picture_url"]].drop_duplicates().values:
            self.book_id_to_url[i[0]] = i[1]
        self.interactions = Int
        
    def coocurrency_count(self):
        Int = self.interactions.drop_duplicates()
        # Для каждого пользователя собираем книги, которые он покупал
        user_books = Int.groupby(["vid"])[["ISBN", "Book-Rating"]].apply(list).reset_index()
        
        # Заводим словарь под товары
        cooc = {}
        for i in tqdm.tqdm_notebook(user_books.values):
            for j in range(len(i[1])):
                for k in range(len(i[1])):
                    if j != k:
                        book1_id = i[1][j][0]
                        book2_id = i[1][k][0]
                        book1_rating = i[1][j][1]
                        book2_rating = i[1][k][1]
                        
                        key = str(book1_id) + "_" + str(book2_id)
                        
                        try:
                            cooc[key] += (book1_rating + book2_rating) / 2
                        except:
                            cooc[key] = (book1_rating + book2_rating) / 2
        cooc_list = []
        for i, j in cooc.items():
            # Если книги встетились вместе больше одного раза, то добавляем в список
            cooc_list.append(i.split("_") + [j])
        self.cooc_rec = pd.DataFrame(cooc_list, columns=["book1", "book2", "measure"])
    
    def get_rec(self, i, show=False):
        recs = self.cooc_rec[self.cooc_rec["book1"] == str(i)]\
                            .sort_values("measure", ascending=False)\
                            .head(10)
        print(u"Для книги")
        rec_imaging([i], self.book_id_to_url)
        print(u"Такие рекомендации")
        rec_imaging(recs["book2"].values.astype(int), self.book_id_to_url, 
                         recs["measure"].values.astype(int))

In [398]:
cooc_rec = Recomendations(interactions)
cooc_rec.coocurrency_count()

  0%|          | 0/10012 [00:00<?, ?it/s]

In [399]:
cooc_rec.cooc_rec.sort_values("measure", ascending=False).head()

Unnamed: 0,book1,book2,measure
804,439136350,439064864,627.0
747,439064864,439136350,627.0
807,439136350,439139597,570.0
864,439139597,439136350,570.0
756,439064864,590353403,562.0


In [400]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

interaction_matrix = interactions.pivot_table(index='vid', columns='book_id', values='Book-Rating')


interaction_matrix.fillna(0, inplace=True)


X_train, X_test = train_test_split(interaction_matrix, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [402]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(X_train_scaled)

def get_similar_users(user_id, n=5):
    similar_users = similarity_matrix[user_id].argsort()[-n:]
    return similar_users[::-1]

def u2i_recommendations(user_id, n=10):
    similar_users = get_similar_users(user_id)
    recommended_items = set()
    for user in similar_users:
        recommended_items.update(set(X_train.columns[X_train.loc[user] > 0]))
    recommended_items.difference_update(set(X_train.columns[X_train.loc[user_id] > 0]))
    return list(recommended_items)[:n]

print(u2i_recommendations(5000))

[1664, 2246, 6538, 1164, 205, 1657, 5394, 8278, 7193, 7993]
