In [149]:
import pandas as pd
import numpy as np

In [96]:
library = pd.read_csv('db_csvs/library.csv', sep="\t")

In [35]:
book_subjects = pd.read_csv('db_csvs/book_subjects.csv', sep="\t")

common_subjects = pd.read_csv('db_csvs/most_used_subjects.csv', sep="\t")

all_subjects = pd.read_csv('db_csvs/subjects.csv', sep="\t")
all_subjects = all_subjects.drop('source', 1)


# Create subject lists for each title

In [36]:
# merge subject names into book_subjects; drop uninteresting subjects
book_subjects = book_subjects.merge(all_subjects, how='left', on='subject_id')

delete_values = ["protected daisy", "accessible book", "in library", "overdrive", "large type books", 'ficci\xc3\xb3n juvenil', 'ficci\xc3\xb3n', 'lending library']
book_subjects = book_subjects[~book_subjects['subject'].isin(delete_values)]


In [37]:
# group by book ids
book_lists = book_subjects.groupby('book_id')
book_lists.get_group(8)

Unnamed: 0,booksubject_id,book_id,subject_id,subject
52,53,8,5617,social life and customs
54,55,8,6228,industrial revolution
55,56,8,7143,problèmes sociaux
56,57,8,7144,conditions sociales
58,59,8,5691,married people
59,60,8,5495,fiction
60,61,8,5588,romans
61,62,8,7141,utilitarianism
62,63,8,5945,social problems
63,64,8,7145,mœurs et coutumes


In [70]:
books = book_subjects['book_id']
books_with_subject_lists = {}
for book_id in books:
    subjects = []
    for subj in book_lists.get_group(book_id)['subject']:
        if books_with_subject_lists.get(book_id):
            books_with_subject_lists[book_id].append(subj)
        else:
            books_with_subject_lists[book_id] = [subj]


In [74]:
books_with_subjects = pd.DataFrame(books_with_subject_lists.items())
books_with_subjects.columns = ['book_id', 'subjects']

In [79]:
books_with_subjects.head()

Unnamed: 0,book_id,subjects
0,1,"[princes, narnia , juvenile fiction, fiction, ..."
1,2,"[united states civil war, scarlett o'hara , po..."
2,3,"[magic, juvenile fiction, fiction, coming of a..."
3,5,"[married women, fiction, married women, fiction]"
4,7,"[data encryption , literature, cryptography, f..."


## Get ratings for each book in library for this user

In [97]:
user_ratings = pd.read_csv('db_csvs/user_books.csv', sep='\t')
library = library.merge(user_ratings, how='left', on='book_id')

In [99]:
titles_df = library.drop(['openlib_bid', 'google_bid', 'img_url', 'goodreads_url', 'get_subjects', 'preview', 'isbn', 'goodreads_bid', 'gr_shelf_id', "source"], 1)

In [100]:
titles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1568 entries, 0 to 1567
Data columns (total 13 columns):
book_id              1568 non-null int64
title                1568 non-null object
author               1568 non-null object
pub_year             1461 non-null float64
original_pub_year    1559 non-null float64
pages                1500 non-null float64
publisher            1462 non-null object
language             1199 non-null object
userbook_id          356 non-null float64
user_id              356 non-null float64
gr_shelf_name        356 non-null object
status               356 non-null object
rating               356 non-null float64
dtypes: float64(6), int64(1), object(6)
memory usage: 171.5+ KB


# Get common subjects as columns in library DF

In [127]:
# merge subject lists into library as single column
book_attributes = titles_df.merge(books_with_subjects, how='left', on='book_id')

In [102]:
# Clean deleted subjects out of common_subjects DF
delete_values = ["protected daisy", "accessible book", "in library", "overdrive", "large type books", 'ficci\xc3\xb3n juvenil', 'ficci\xc3\xb3n', 'lending library']
common_subjects = common_subjects[~common_subjects['subject'].isin(delete_values)]

In [126]:
subj_columns = list(common_subjects['subject'])

In [124]:
# gets all columns in as empty columns
for subj in subj_columns:
    book_attributes[subj] = 0

In [128]:
book_attributes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1568 entries, 0 to 1567
Data columns (total 14 columns):
book_id              1568 non-null int64
title                1568 non-null object
author               1568 non-null object
pub_year             1461 non-null float64
original_pub_year    1559 non-null float64
pages                1500 non-null float64
publisher            1462 non-null object
language             1199 non-null object
userbook_id          356 non-null float64
user_id              356 non-null float64
gr_shelf_name        356 non-null object
status               356 non-null object
rating               356 non-null float64
subjects             1104 non-null object
dtypes: float64(6), int64(1), object(7)
memory usage: 183.8+ KB


In [135]:
book_attributes['subjects'] = book_attributes['subjects'].fillna("")

In [136]:
book_attributes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1568 entries, 0 to 1567
Data columns (total 14 columns):
book_id              1568 non-null int64
title                1568 non-null object
author               1568 non-null object
pub_year             1461 non-null float64
original_pub_year    1559 non-null float64
pages                1500 non-null float64
publisher            1462 non-null object
language             1199 non-null object
userbook_id          356 non-null float64
user_id              356 non-null float64
gr_shelf_name        356 non-null object
status               356 non-null object
rating               356 non-null float64
subjects             1568 non-null object
dtypes: float64(6), int64(1), object(7)
memory usage: 183.8+ KB


In [137]:
for subject in subj_columns:
    subj_col = []
    for x in book_attributes['subjects']:
        if subject in x:
            subj_col.append(1)
        else:
            subj_col.append(0)
    book_attributes[subject] = subj_col

In [139]:
book_attributes.head()

Unnamed: 0,book_id,title,author,pub_year,original_pub_year,pages,publisher,language,userbook_id,user_id,...,clergy,mothers and daughters,german fiction,artists,english literature,books and reading,united states,authorship,criminals,mystery
0,771,Dirk Gently’s Holistic Detective Agency,Douglas Adams,1991,1987,306,Pocket Books,eng,,,...,0,0,0,0,0,0,0,0,0,0
1,772,The Long Dark Teatime of the Soul,Douglas Adams,1991,1988,307,Pocket Books,en-US,,,...,0,0,0,0,0,0,0,0,0,0
2,773,Aesop’s Fables,Aesop,2003,-560,306,Oxford University Press,eng,,,...,0,0,0,0,0,0,0,0,0,0
3,774,Novel With Cocaine,M. Ageyev,1998,1934,204,Northwestern University Press,eng,,,...,0,0,0,0,0,0,0,0,0,0
4,775,In The Heart of the Seas,Nathaniel Philbrick,2001,1999,238,Penguin Books,en-US,,,...,0,0,0,0,0,0,0,0,0,0


In [143]:
book_attributes.describe()

Unnamed: 0,book_id,pub_year,original_pub_year,pages,userbook_id,user_id,rating,fiction,history,classic literature,...,clergy,mothers and daughters,german fiction,artists,english literature,books and reading,united states,authorship,criminals,mystery
count,1568.0,1461.0,1559.0,1500.0,356.0,356,356.0,1568.0,1568.0,1568.0,...,1568.0,1568.0,1568.0,1568.0,1568.0,1568.0,1568.0,1568.0,1568.0,1568.0
mean,944.61352,1999.930185,1944.608724,355.053333,178.5,1,2.202247,0.561224,0.094388,0.080357,...,0.007015,0.007015,0.007015,0.007015,0.007015,0.007015,0.007015,0.007015,0.007015,0.007015
std,542.617955,9.355347,144.385816,219.148844,102.912584,0,2.060656,0.496396,0.292461,0.271932,...,0.08349,0.08349,0.08349,0.08349,0.08349,0.08349,0.08349,0.08349,0.08349,0.08349
min,1.0,1950.0,-800.0,0.0,1.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,440.75,1996.0,1932.0,218.0,89.75,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1018.5,2002.0,1974.0,311.0,178.5,1,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1410.25,2006.0,1997.0,430.25,267.25,1,4.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1802.0,2017.0,2015.0,2549.0,356.0,1,5.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Turn publication years into usable data

1947.9800899165061

## Random Forest

In [153]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

In [176]:
# isolating rated books
rated_books = book_attributes[book_attributes.rating>0]
rated_books['original_pub_year'].mean()

1975.9473684210527

In [154]:
# generic cross validation function
def cross_validate(features, target, classifier, k_fold) :

    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(features), n_folds=k_fold,
                           shuffle=True, random_state=0)

    k_score_total = 0
    
    # for each training and testing slices run the classifier, and score the results
    for train_slice, test_slice in k_fold_indices :

        model = classifier(features[train_slice],
                           target[train_slice])

        k_score = model.score(features[test_slice],
                              target[test_slice])

        k_score_total += k_score

    # return the average accuracy
    return k_score_total/k_fold

In [161]:
cols = book_attributes.columns
for x in cols:
    print "'"+x+"', "

'book_id', 
'title', 
'author', 
'pub_year', 
'original_pub_year', 
'pages', 
'publisher', 
'language', 
'userbook_id', 
'user_id', 
'gr_shelf_name', 
'status', 
'rating', 
'subjects', 
'fiction', 
'history', 
'classic literature', 
'social life and customs', 
'literature', 
'fiction in english', 
'popular print disabled books', 
'juvenile fiction', 
'women', 
'translations into english', 
'biography', 
'world war', 
'new york times bestseller', 
'young women', 
'drama', 
'science fiction', 
'social conditions', 
'british', 
'young men', 
'readers', 
'fantasy', 
'historical fiction', 
'english fiction', 
'man-woman relationships', 
'family', 
'friendship', 
'fantasy fiction', 
'juvenile literature', 
'married women', 
'triangles ', 
'england', 
'manuscripts', 
'married people', 
'detective and mystery stories', 
'orphans', 
'americans', 
'fathers and daughters', 
'nonfiction', 
'translations into russian', 
'travel', 
'psychological fiction', 
'romance', 
'murder', 
'domestic fiction',

In [None]:
features = [
'title', 
'author', 
'original_pub_year', 
'pages', 
'publisher', 
'language', 
'gr_shelf_name', 
'fiction', 
'history', 
'classic literature', 
'social life and customs', 
'literature', 
'fiction in english', 
'popular print disabled books', 
'juvenile fiction', 
'women', 
'translations into english', 
'biography', 
'world war', 
'new york times bestseller', 
'young women', 
'drama', 
'science fiction', 
'social conditions', 
'british', 
'young men', 
'readers', 
'fantasy', 
'historical fiction', 
'english fiction', 
'man-woman relationships', 
'family', 
'friendship', 
'fantasy fiction', 
'juvenile literature', 
'married women', 
'triangles ', 
'england', 
'manuscripts', 
'married people', 
'detective and mystery stories', 
'orphans', 
'americans', 
'fathers and daughters', 
'nonfiction', 
'translations into russian', 
'travel', 
'psychological fiction', 
'romance', 
'murder', 
'domestic fiction', 
'facsimiles', 
'families', 
'sisters', 
'social classes', 
'authors', 
'english language', 
'gay men', 
'criticism and interpretation', 
'race relations', 
'description and travel', 
'history and criticism', 
'man booker prize winner', 
'award:man_booker_prize', 
'city and town life', 
'death', 
'romans', 
'male friendship', 
'boys', 
'private investigators', 
'inheritance and succession', 
'jews', 
'country homes', 
'children's stories', 
'fathers and sons', 
'thriller', 
'adaptations', 
'french fiction', 
'comic books', 
'psychology', 
'adventure stories', 
'immigrants', 
'adultery', 
'specimens', 
'interpersonal relations', 
'national book critics circle award winner', 
'politics and government', 
'african americans', 
'award:national_book_critics_circle_award', 
'women authors', 
'adventure and adventurers', 
'clergy', 
'mothers and daughters', 
'german fiction', 
'artists', 
'english literature', 
'books and reading', 
'united states', 
'authorship', 
'criminals', 
'mystery']