In [2]:
import pandas as pd
import numpy as np

In [3]:
library = pd.read_csv('db_csvs/library.csv', sep="\t")

In [4]:
book_subjects = pd.read_csv('db_csvs/book_subjects.csv', sep="\t")

common_subjects = pd.read_csv('db_csvs/most_used_subjects.csv', sep="\t")

all_subjects = pd.read_csv('db_csvs/subjects.csv', sep="\t")
all_subjects = all_subjects.drop('source', 1)


# Create subject lists for each title

In [5]:
# merge subject names into book_subjects; drop uninteresting subjects
book_subjects = book_subjects.merge(all_subjects, how='left', on='subject_id')

delete_values = ["protected daisy", "accessible book", "in library", "overdrive", "large type books", 'ficci\xc3\xb3n juvenil', 'ficci\xc3\xb3n', 'lending library']
book_subjects = book_subjects[~book_subjects['subject'].isin(delete_values)]


In [6]:
# group by book ids
book_lists = book_subjects.groupby('book_id')
book_lists.get_group(8)

Unnamed: 0,booksubject_id,book_id,subject_id,subject
52,53,8,5617,social life and customs
54,55,8,6228,industrial revolution
55,56,8,7143,problèmes sociaux
56,57,8,7144,conditions sociales
58,59,8,5691,married people
59,60,8,5495,fiction
60,61,8,5588,romans
61,62,8,7141,utilitarianism
62,63,8,5945,social problems
63,64,8,7145,mœurs et coutumes


In [7]:
books = book_subjects['book_id']
books_with_subject_lists = {}
for book_id in books:
    subjects = []
    for subj in book_lists.get_group(book_id)['subject']:
        if books_with_subject_lists.get(book_id):
            books_with_subject_lists[book_id].append(subj)
        else:
            books_with_subject_lists[book_id] = [subj]


In [9]:
books_with_subjects = pd.DataFrame(books_with_subject_lists.items())
books_with_subjects.columns = ['book_id', 'subjects']
books_with_subjects.head()

Unnamed: 0,book_id,subjects
0,1,"[princes, narnia , juvenile fiction, fiction, ..."
1,2,"[united states civil war, scarlett o'hara , po..."
2,3,"[magic, juvenile fiction, fiction, coming of a..."
3,5,"[married women, fiction, married women, fiction]"
4,7,"[data encryption , literature, cryptography, f..."


## Get ratings for each book in library for this user

In [10]:
user_ratings = pd.read_csv('db_csvs/user_books.csv', sep='\t')
library = library.merge(user_ratings, how='left', on='book_id')

In [11]:
titles_df = library.drop(['openlib_bid', 'google_bid', 'img_url', 'goodreads_url', 'get_subjects', 'preview', 'isbn', 'goodreads_bid', 'gr_shelf_id', "source"], 1)

In [12]:
titles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1568 entries, 0 to 1567
Data columns (total 13 columns):
book_id              1568 non-null int64
title                1568 non-null object
author               1568 non-null object
pub_year             1461 non-null float64
original_pub_year    1559 non-null float64
pages                1500 non-null float64
publisher            1462 non-null object
language             1199 non-null object
userbook_id          356 non-null float64
user_id              356 non-null float64
gr_shelf_name        356 non-null object
status               356 non-null object
rating               356 non-null float64
dtypes: float64(6), int64(1), object(6)
memory usage: 171.5+ KB


# Get common subjects as columns in library DF

In [13]:
# merge subject lists into library as single column
book_attributes = titles_df.merge(books_with_subjects, how='left', on='book_id')

In [14]:
# Clean deleted subjects out of common_subjects DF
delete_values = ["protected daisy", "accessible book", "in library", "overdrive", "large type books", 'ficci\xc3\xb3n juvenil', 'ficci\xc3\xb3n', 'lending library']
common_subjects = common_subjects[~common_subjects['subject'].isin(delete_values)]

In [15]:
subj_columns = list(common_subjects['subject'])

In [16]:
book_attributes['subjects'] = book_attributes['subjects'].fillna("")

In [17]:
for subject in subj_columns:
    subj_col = []
    for x in book_attributes['subjects']:
        if subject in x:
            subj_col.append(1)
        else:
            subj_col.append(0)
    book_attributes[subject] = subj_col

In [18]:
book_attributes.head()

Unnamed: 0,book_id,title,author,pub_year,original_pub_year,pages,publisher,language,userbook_id,user_id,...,clergy,mothers and daughters,german fiction,artists,english literature,books and reading,united states,authorship,criminals,mystery
0,771,Dirk Gently’s Holistic Detective Agency,Douglas Adams,1991,1987,306,Pocket Books,eng,,,...,0,0,0,0,0,0,0,0,0,0
1,772,The Long Dark Teatime of the Soul,Douglas Adams,1991,1988,307,Pocket Books,en-US,,,...,0,0,0,0,0,0,0,0,0,0
2,773,Aesop’s Fables,Aesop,2003,-560,306,Oxford University Press,eng,,,...,0,0,0,0,0,0,0,0,0,0
3,774,Novel With Cocaine,M. Ageyev,1998,1934,204,Northwestern University Press,eng,,,...,0,0,0,0,0,0,0,0,0,0
4,775,In The Heart of the Seas,Nathaniel Philbrick,2001,1999,238,Penguin Books,en-US,,,...,0,0,0,0,0,0,0,0,0,0


## Turn publication years into usable data

In [20]:
original_pub_buckets = []
for item in book_attributes['original_pub_year']:
    if item > 1950:
        original_pub_buckets.append('1950+')
    elif item > 1900:
        original_pub_buckets.append('1901-50')
    elif item > 1850:
        original_pub_buckets.append('1851-1900')
    elif item > 1800:
        original_pub_buckets.append('1851-1900')      
    elif item > 1700:
        original_pub_buckets.append('1701-1800')      
    elif item > 1500:
        original_pub_buckets.append('1501-1700')  
    else:
        original_pub_buckets.append('Unknown')    
book_attributes.insert(loc=5, column='orig_pub_year_cat', value=original_pub_buckets)

In [49]:
pub_buckets = []
for item in book_attributes['pub_year']:
    if item > 1950:
        pub_buckets.append('1950+')
    elif item > 1900:
        pub_buckets.append('1901-50')
    elif item > 1850:
        pub_buckets.append('1851-1900')
    elif item > 1800:
        pub_buckets.append('1851-1900')      
    elif item > 1700:
        pub_buckets.append('1701-1800')      
    elif item > 1500:
        pub_buckets.append('1501-1700')  
    else:
        pub_buckets.append('Unknown')   
book_attributes.insert(loc=5, column='pub_year_cat', value=pub_buckets)


In [55]:
#get dummies for pub_year_cat and orig_pub_year_cat
pub_year_cat_dummies = pd.get_dummies(book_attributes['pub_year_cat'])
orig_year_dummies = pd.get_dummies(book_attributes['orig_pub_year_cat'])

In [59]:
book_full_attr = book_attributes.merge(pub_year_cat_dummies,left_index=True, right_index=True)

In [66]:
book_full_attr = book_full_attr.merge(orig_year_dummies,left_index=True, right_index=True)

In [69]:
book_full_attr.columns

Index([u'book_id', u'title', u'author', u'pub_year', u'original_pub_year',
       u'pub_year_cat', u'orig_pub_year_cat', u'pages', u'publisher',
       u'language',
       ...
       u'mystery', u'1901-50_x', u'1950+_x', u'Unknown_x', u'1501-1700',
       u'1701-1800', u'1851-1900', u'1901-50_y', u'1950+_y', u'Unknown_y'],
      dtype='object', length=119)

In [71]:
book_full_attr = book_full_attr.drop(['title', 'author', 'pub_year', 'original_pub_year', 'pub_year_cat', 'orig_pub_year_cat', 'publisher', 'language'], 1)
book_full_attr = book_full_attr.drop(['userbook_id', 'user_id', 'gr_shelf_name'], 1)

In [74]:
book_full_attr.columns

Index([u'book_id', u'pages', u'status', u'rating', u'subjects', u'fiction',
       u'history', u'classic literature', u'social life and customs',
       u'literature',
       ...
       u'mystery', u'1901-50_x', u'1950+_x', u'Unknown_x', u'1501-1700',
       u'1701-1800', u'1851-1900', u'1901-50_y', u'1950+_y', u'Unknown_y'],
      dtype='object', length=108)

In [76]:
rating_list = book_full_attr['rating']
len(rating_list)

1568

In [77]:
book_full_attr = book_full_attr.drop('rating', 1)

In [78]:
book_full_attr.insert(loc=0, column='ratings', value=rating_list)

In [87]:
train = book_full_attr[np.isfinite(book_full_attr['ratings'])]

In [90]:
train.reset_index(inplace=True)

In [None]:
train = train.drop('subjects', 1)

In [105]:
unrated = train[train.ratings == 0]

In [113]:
rated = train[train.ratings != 0]

In [114]:
rated.head()

Unnamed: 0,index,ratings,book_id,pages,status,fiction,history,classic literature,social life and customs,literature,...,mystery,1901-50_x,1950+_x,Unknown_x,1501-1700,1701-1800,1851-1900,1901-50_y,1950+_y,Unknown_y
0,28,2,12,,read,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,29,4,15,320.0,read,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,52,2,16,384.0,read,1,0,1,1,0,...,0,0,1,0,0,0,1,0,0,0
3,53,3,17,256.0,read,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,54,2,18,209.0,read,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [116]:
rated_ratings = rated['ratings']
rated = rated.drop(['ratings', 'status', 'index'], 1)

In [117]:
rated.insert(loc=0, column='ratings', value=rated_ratings)

## Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
import random

In [95]:
%matplotlib inline
from matplotlib import pylab, pyplot  # plotting

In [185]:
rows = random.sample(rated.index, 209)
rated_train = rated.ix[rows[:160]]
rated_validate = rated.ix[rows[160:]]

In [186]:
rated_train['pages'].fillna(0, inplace=True)
rated_validate['pages'].fillna(0, inplace=True)
actuals = rated_validate['ratings']

In [279]:
# translate specific ratings into recommend/do not recommend
def rating_categories(rating_list):
    ratings = []
    for x in rating_list:
        if x > 3:
            ratings.append(1)
        else:
            ratings.append(-1)
    return ratings

In [280]:
# translate actual ratings to recommend/do not recommend
actual_ratings = rating_categories(actuals)

In [281]:
# instantiate RF classifier; train on rated training data
rf = RandomForestClassifier(n_estimators=50)
rf.fit(rated_train.iloc[:,2:], rated_train.iloc[:, 0])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [282]:
# get predictions & translate to recommend/do not recommend categories
rf_predictions = rf.predict(rated_validate.iloc[:,2:])
rf_predicted_ratings = rating_categories(rf_predictions)

In [283]:
# accuracy calculation for algorithm comparison
def calculate_accuracy(validation_ratings, predicted_ratings):
    got = 0
    for x,y in zip(validation_ratings, predicted_ratings):
        if x==y:
            got +=1
    accuracy = got/float(len(validation_ratings))
    return accuracy

In [284]:
# calculate accuracy for predictions
rf_accuracy = calculate_accuracy(actual_ratings, rf_predicted_ratings)
print rf_accuracy

0.612244897959


In [285]:
from sklearn.neighbors import KNeighborsClassifier
neighbors = KNeighborsClassifier(n_neighbors=11)
neighbors.fit(rated_train.iloc[:,2:], rated_train.iloc[:, 0])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [286]:
knn_predictions = neighbors.predict(rated_validate.iloc[:,2:])
knn_predicted_ratings = rating_categories(knn_predictions)

In [287]:
knn_accuracy = calculate_accuracy(actual_ratings, knn_predicted_ratings)
print knn_accuracy

0.591836734694


# Getting data back into my database

In [None]:
# save book_id with rating in a dictionary


## Confusion Matrix

## NEXT STEPS

In [None]:
# generic cross validation function
def cross_validate(features, target, classifier, k_fold) :

    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(features), n_folds=k_fold,
                           shuffle=True, random_state=0)

    k_score_total = 0
    
    # for each training and testing slices run the classifier, and score the results
    for train_slice, test_slice in k_fold_indices :

        model = classifier(features[train_slice],
                           target[train_slice])

        k_score = model.score(features[test_slice],
                              target[test_slice])

        k_score_total += k_score

    # return the average accuracy
    return k_score_total/k_fold