In [1]:
import pandas as pd
import json
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import word_tokenize
from collections import Counter
import string
from collections import defaultdict
from scipy.sparse import hstack
import re
import time

# Load data

In [2]:
with open('filter_book.json') as f:
    whole_data = [json.loads(d) for d in f.readlines()]

In [3]:
whole_data[1]

{'asin': ' ',
 'authors': [{'author_id': '5321960', 'role': ''}],
 'average_rating': '4.05',
 'book_id': '17339335',
 'country_code': 'US',
 'cover': 'Knight Avenged',
 'description': "<b>Alone in a world on the brink of war...two unlikely allies will discover a love greater than time.</b><br /><br />Exiled from her home, powerful oracle Cosmina Cordei holds the key to uniting those protecting mankind from evil. But just as she makes her way into the holy city to perform an ancient rite, the enemy closes in for the kill...<br /><br />Drawn by a destiny he won't accept, elite assassin Henrik Lazar detests the mystical curse handed down by his mother. But when the sorcery in his blood is activated and past pain comes back to haunt him, his new abilities come into play and he must learn to control them.<br /><br />Rescued by Henrik in the heat of battle, Cosmina must decide whether to trust the assassin who loathes the goddess she serves or face certain death on her own. Forced into an un

In [4]:
# obtain all categories
cats_with_counts = defaultdict(int)
for d in whole_data:
    for g,i in d['genre'].items():
        cats_with_counts[g] += i

In [5]:
book_ids = set([d['book_id'] for d in whole_data])

In [6]:
# class distribution
sum_cat_count = sum([k for k in cats_with_counts.values()])
for k,v in cats_with_counts.items():
    print(k, v/sum_cat_count)

fiction 0.22935386545496342
history, historical fiction, biography 0.11218489683274621
fantasy, paranormal 0.19868476542109198
romance 0.10883620771799321
young-adult 0.07738126345084886
non-fiction 0.06895897491625524
mystery, thriller, crime 0.08572122383339018
comics, graphic 0.058846502971987
children 0.04873140533562655
poetry 0.011300894065097345


In [7]:
book_id_description = dict()
book_id_cover = dict()
book_id_num_pages = dict()
book_id_rate = dict()
book_id_similar = dict()
book_id_shelves = dict()
for d in whole_data:
    b_id = d['book_id']
    des = re.sub('<[^<]+?>', '', d['description'])
    book_id_description[b_id] = des
    book_id_cover[b_id] = d['cover']
    if 'num_pages' in d:
        book_id_num_pages[b_id] = d['num_pages']
    if 'average_rating' and 'ratings_count' in d:
        book_id_rate[b_id] = (d['average_rating'],d['ratings_count'])
    book_id_similar[b_id] = set(d['similar_books']).intersection(book_ids)
    book_id_shelves[b_id] = d['popular_shelves']

In [8]:
train_data = whole_data[:len(whole_data)//2]
validation_data = whole_data[len(whole_data)//2:int(0.7*len(whole_data))]
test_data = whole_data[int(0.7*len(whole_data)):]

In [9]:
print(len(train_data))
print(len(validation_data))
print(len(test_data))

50000
20000
30000


# Naive model

In [10]:
def most_freq(sort_genre):
    """
    sort_genre: [(genre, count)]
    """
    freq = sort_genre[0][1]
    pick = []
    for d in sort_genre:
        if d[1] == freq:
            pick.append(d[0])
        else:
            break
    return np.random.choice(pick)

In [11]:
book_id_genre = dict()
for d in whole_data:
    sort_genre = sorted(d['genre'].items(),key=lambda x: x[1],reverse=True)
    book_id_genre[d['book_id']] = most_freq(sort_genre)

In [12]:
genre_list = list(cats_with_counts)
genre_oh = dict()
genre_idx = dict()
for d in genre_list:
    tmp = np.zeros(len(genre_list))
    tmp[genre_list.index(d)] = 1
    genre_oh[d] = tmp
    genre_idx[d] = genre_list.index(d)

# Library models

#### Tfidf with description

In [73]:
all_description = []
all_y = []
punctuation = set(string.punctuation)
for d in whole_data:
    r = ''.join([c for c in book_id_description[d['book_id']].lower() if not c in punctuation])
    all_description.append(r)
    all_y.append(genre_idx[book_id_genre[d['book_id']]])

In [74]:
train_description = []
train_y = []
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in book_id_description[d['book_id']].lower() if not c in punctuation])
    train_description.append(r)
    train_y.append(genre_idx[book_id_genre[d['book_id']]])

In [75]:
val_description = []
val_y = []
punctuation = set(string.punctuation)
for d in validation_data:
    r = ''.join([c for c in book_id_description[d['book_id']].lower() if not c in punctuation])
    val_description.append(r)
    val_y.append(genre_idx[book_id_genre[d['book_id']]])

In [76]:
test_description = []
test_y = []
punctuation = set(string.punctuation)
for d in test_data:
    r = ''.join([c for c in book_id_description[d['book_id']].lower() if not c in punctuation])
    test_description.append(r)
    test_y.append(genre_idx[book_id_genre[d['book_id']]])

In [77]:
vectorizer = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             lowercase=True,
                             stop_words='english',
                             sublinear_tf=True)

In [78]:
tfidf_mx = vectorizer.fit(all_description)
X_train = vectorizer.transform(train_description + val_description)
# X_val = vectorizer.transform(val_description)
X_test = vectorizer.transform(test_description)

In [79]:
all_title = []
for d in whole_data:
    r = ''.join([c for c in book_id_cover[d['book_id']].lower() if not c in punctuation])
    all_title.append(r)
    
train_title = []
for d in train_data:
    r = ''.join([c for c in book_id_cover[d['book_id']].lower() if not c in punctuation])
    train_title.append(r)
    
val_title = []
for d in validation_data:
    r = ''.join([c for c in book_id_cover[d['book_id']].lower() if not c in punctuation])
    val_title.append(r)
    
test_title = []
for d in test_data:
    r = ''.join([c for c in book_id_cover[d['book_id']].lower() if not c in punctuation])
    test_title.append(r)
    
vectorizer_title = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             lowercase=True,
                             stop_words='english',
                             sublinear_tf=True)

tfidf_mx_title = vectorizer_title.fit(all_title)
title_feat_train = vectorizer_title.transform(train_title + val_title)
# title_feat_val = vectorizer_title.transform(val_title)
title_feat_test = vectorizer_title.transform(test_title)

X_train_2 = hstack([X_train, title_feat_train])
# X_val_2 = hstack([X_val, title_feat_val])

In [81]:
X_test_2 = hstack([X_test, title_feat_test])

In [55]:
# wts = dict()
# for d in cats_with_counts:
#     wts[genre_list.index(d)] = sum_cat_count / cats_with_counts[d]

In [56]:
def get_acc(predictions, val_y):
    acc = sum([1 for i in range(len(predictions)) if predictions[i] == val_y[i]]) / len(predictions)
    return acc

In [80]:
all_train_y = train_y + val_y
svc = LinearSVC(C=0.1)
svc.fit(X_train_2,all_train_y)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [82]:
predictions = svc.predict(X_test_2)
print(get_acc(predictions,test_y))

0.6788


In [100]:
# feat: [description | title]
start = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(100,100),learning_rate='adaptive',early_stopping=True)
mlp.fit(X_train_2,all_train_y)
predictions = mlp.predict(X_test_2)
print(time.time() - start)
print(get_acc(predictions,test_y))

4635.4024930000305
0.6923666666666667


In [83]:
train_cat_dict = defaultdict(str)
t_data = train_data + validation_data
for d in t_data:
    cat_list = [(genre, count) for genre,count in d['genre'].items()]
    main_cat = max(cat_list, key = lambda x: x[1])[0]
    train_cat_dict[d['book_id']] = main_cat

sim_cat_feat_train = []
for d in train_data:
    similar_list = book_id_similar[d['book_id']]
    sim_cat_list = [train_cat_dict[id] for id in similar_list]
    score_list = [0]*10
    for cat in sim_cat_list:
        if cat != "":
            score_list[genre_idx[cat]] += 1
    feat_arr = np.array(score_list)
    if sum(feat_arr) != 0:
        feat_arr = feat_arr / sum(feat_arr)
    sim_cat_feat_train.append(feat_arr)
    
sim_cat_feat_val = []
for d in validation_data:
    similar_list = book_id_similar[d['book_id']]
    sim_cat_list = [train_cat_dict[id] for id in similar_list]
    score_list = [0]*10
    for cat in sim_cat_list:
        if cat != "":
            score_list[genre_idx[cat]] += 1
    feat_arr = np.array(score_list)
    if sum(feat_arr) != 0:
        feat_arr = feat_arr / sum(feat_arr)
    sim_cat_feat_val.append(feat_arr)
    
X_train_3 = hstack([X_train_2, sim_cat_feat_train + sim_cat_feat_val])
# X_val_3 = hstack([X_val_2, sim_cat_feat_val])

sim_cat_feat_test = []
for d in test_data:
    similar_list = book_id_similar[d['book_id']]
    sim_cat_list = [train_cat_dict[id] for id in similar_list]
    score_list = [0]*10
    for cat in sim_cat_list:
        if cat != "":
            score_list[genre_idx[cat]] += 1
    feat_arr = np.array(score_list)
    if sum(feat_arr) != 0:
        feat_arr = feat_arr / sum(feat_arr)
    sim_cat_feat_test.append(feat_arr)
    
X_test_3 = hstack([X_test_2, sim_cat_feat_test])

In [84]:
svc = LinearSVC(C=0.1)
svc.fit(X_train_3,all_train_y)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [85]:
predictions = svc.predict(X_test_3)
print(get_acc(predictions,test_y))

0.7068


In [86]:
t_train = train_data + validation_data
rate_count_feat_train = np.log(np.array([np.max([int(d['ratings_count']), 0.1]) for d in t_train]))
rate_count_mean = np.mean(rate_count_feat_train)
rate_count_std = np.std(rate_count_feat_train)
rate_count_feat_train = (rate_count_feat_train - rate_count_mean) / (3 * rate_count_std)
rate_count_feat_train = np.reshape(rate_count_feat_train, (len(rate_count_feat_train), 1))

# rate_count_feat_val = np.log(np.array([np.max([int(d['ratings_count']), 0.1]) for d in validation_data]))
# rate_count_mean = np.mean(rate_count_feat_val)
# rate_count_std = np.std(rate_count_feat_val)
# rate_count_feat_val = (rate_count_feat_val - rate_count_mean) / (3 * rate_count_std)
# rate_count_feat_val = np.reshape(rate_count_feat_val, (len(rate_count_feat_val), 1))

X_train_4 = hstack([X_train_3, rate_count_feat_train])
# X_val_4 = hstack([X_val_3, rate_count_feat_val])

rate_count_feat_test = np.log(np.array([np.max([int(d['ratings_count']), 0.1]) for d in test_data]))
rate_count_mean = np.mean(rate_count_feat_test)
rate_count_std = np.std(rate_count_feat_test)
rate_count_feat_test = (rate_count_feat_test - rate_count_mean) / (3 * rate_count_std)
rate_count_feat_test = np.reshape(rate_count_feat_test, (len(rate_count_feat_test), 1))

In [88]:
X_test_4 = hstack([X_test_3, rate_count_feat_test])

In [87]:
svc = LinearSVC(C=0.1)
svc.fit(X_train_4,all_train_y)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [107]:
predictions = svc.predict(X_test_4)
print(get_acc(predictions,test_y))

0.7074333333333334


In [115]:
flattened_test = np.array(test_y).reshape(1,np.array(test_y).shape[0])[0]
flattened_pred = np.array(predictions).reshape(1,np.array(predictions).shape[0])[0]
print(recall_score(flattened_test,flattened_pred))
print(precision_score(flattened_test,flattened_pred))
print(f1_score(flattened_test,flattened_pred))

ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [101]:
# feat: [description | title | similar books | rating counts]
start = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(100,100),learning_rate='adaptive',early_stopping=True)
mlp.fit(X_train_4,all_train_y)
predictions = mlp.predict(X_test_4)
print(time.time() - start)
print(get_acc(predictions,test_y))

4796.206936836243
0.7165666666666667


# Multilable attempt

In [90]:
def multilabel_convert(genres):
    tmp = np.zeros(len(genre_list))
    for g in genres:
        tmp[genre_list.index(g)] = 1
    return tmp

In [91]:
multi_book_to_genre_oh = dict()
for d in whole_data:
    multi_book_to_genre_oh[d['book_id']] = multilabel_convert(list(d['genre']))

In [92]:
mul_train_y = []
punctuation = set(string.punctuation)
for d in train_data:
    mul_train_y.append(multi_book_to_genre_oh[d['book_id']])

In [93]:
mul_val_y = []
punctuation = set(string.punctuation)
for d in validation_data:
    mul_val_y.append(multi_book_to_genre_oh[d['book_id']])

In [94]:
mul_test_y = []
punctuation = set(string.punctuation)
for d in test_data:
    mul_test_y.append(multi_book_to_genre_oh[d['book_id']])

In [None]:
all_mul_train_y = mul_train_y + mul_val_y

In [102]:
start = time.time()
clf = RandomForestClassifier()
clf.fit(X_train_4,all_mul_train_y)
predictions = clf.predict(X_test_4)
print(time.time() - start)

402.39863777160645


In [97]:
acc = 0
total = 0
for i,p in enumerate(predictions):
    teacher = mul_test_y[i]
    for j,d in enumerate(p):
        if teacher[j] == d:
            acc += 1
        total += 1

In [98]:
acc / total

0.8288633333333333

In [99]:
flattened_test = np.array(mul_test_y).reshape(1,np.array(mul_test_y).shape[0]*np.array(mul_test_y).shape[1])[0]
flattened_pred = np.array(predictions).reshape(1,np.array(predictions).shape[0]*np.array(predictions).shape[1])[0]
print(recall_score(flattened_test,flattened_pred))
print(precision_score(flattened_test,flattened_pred))
print(f1_score(flattened_test,flattened_pred))

0.407394498365
0.809860093271
0.542092917473


In [67]:
# previous value from pure description
# flattened_test = np.array(mul_test_y).reshape(1,np.array(mul_test_y).shape[0]*np.array(mul_test_y).shape[1])[0]
# flattened_pred = np.array(predictions).reshape(1,np.array(predictions).shape[0]*np.array(predictions).shape[1])[0]
# print(recall_score(flattened_test,flattened_pred))
# print(precision_score(flattened_test,flattened_pred))
# print(f1_score(flattened_test,flattened_pred))

0.363218064621
0.761053302187
0.491746593669


In [None]:
# previous_w_des_acc = 81.4795

In [103]:
start = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(100,100),learning_rate='adaptive',early_stopping=True)
mlp.fit(X_train_4,all_mul_train_y)
predictions = mlp.predict(X_test_4)
print(time.time() - start)

5300.544007062912


In [104]:
acc = 0
total = 0
for i,p in enumerate(predictions):
    teacher = mul_test_y[i]
    for j,d in enumerate(p):
        if teacher[j] == d:
            acc += 1
        total += 1

In [105]:
acc / total

0.88476

In [106]:
flattened_test = np.array(mul_test_y).reshape(1,np.array(mul_test_y).shape[0]*np.array(mul_test_y).shape[1])[0]
flattened_pred = np.array(predictions).reshape(1,np.array(predictions).shape[0]*np.array(predictions).shape[1])[0]
print(recall_score(flattened_test,flattened_pred))
print(precision_score(flattened_test,flattened_pred))
print(f1_score(flattened_test,flattened_pred))

0.69283875811
0.81593571406
0.749365657034


In [48]:
# previous mlp only description
# acc = 87.083
# flattened_val = np.array(mul_val_y).reshape(1,np.array(mul_val_y).shape[0]*np.array(mul_val_y).shape[1])[0]
# flattened_pred = np.array(predictions).reshape(1,np.array(predictions).shape[0]*np.array(predictions).shape[1])[0]
# print(recall_score(flattened_val,flattened_pred))
# print(precision_score(flattened_val,flattened_pred))
# print(f1_score(flattened_val,flattened_pred))

0.633923865894
0.800911698422
0.707700663031
