In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline   

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# import libraries for model validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [40]:
# 1.Read the books dataset and explore it
# Import the dataset with correct encoding
books = pd.read_csv("BX-Books.csv", encoding='iso-8859-1')

# Print the columns of the DataFrame
print(books.dtypes)

# Identify the shape of the dataset
print(books.shape)

# Visualize the dataset
books.head()

isbn                   object
book_title             object
book_author            object
year_of_publication    object
publisher              object
dtype: object
(271379, 5)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [41]:
# Remove non-numeric instances in year_of_publication column
def editYear(year):
    outputStr = year
    if not str(year).isdigit():
        print('Non-numeric year : ' + year)
        outputStr = 0
            
    return outputStr

books['year_of_publication'] = books['year_of_publication'].apply(editYear)
books['year_of_publication'] = pd.to_numeric(books['year_of_publication'])

Non-numeric year : John Peterman
Non-numeric year : \"Freedom Song\""
Non-numeric year : Frank Muir
Non-numeric year : ROBERT A. WILSON
Non-numeric year : Karen T. Whittenburg
Non-numeric year : George H. Scherr
Non-numeric year : Salvador de Madariaga
Non-numeric year : K.C. Constantine
Non-numeric year : Stan Berenstain
Non-numeric year : Francine Pascal
Non-numeric year : Luella Hill
Non-numeric year : John Alderson Foote
Non-numeric year : DK Publishing Inc
Non-numeric year : Jules Janin
Non-numeric year : Gallimard
Non-numeric year : DK Publishing Inc
Non-numeric year :  &amp
Non-numeric year :  Learning"
Non-numeric year : Isadora Duncan
Non-numeric year : Beatrix Potter
Non-numeric year : Bart Rulon
Non-numeric year : Alan Rich


In [42]:
# 2.Clean up NaN values

print('Before : ' + str(books.shape))
# Identify the variables with null values
books.isna().any()

# Missing value treatment
# Remove the records having NaN values are null

books = books.dropna(subset=['book_author','publisher'])
books = books.reset_index(drop=True)

print('After : ' + str(books.shape))




Before : (271379, 5)
After : (271376, 5)


In [43]:
# Import and Process the Users table

users = pd.read_csv("BX-Users.csv", encoding='iso-8859-1')

print('Before : ' + str(users.shape))
# Dropping Age column since almost half the netries here are Nan
# Replacing the NaN values wwith average value and using it to train
# the classifies would be pointless

users = users.drop(['Age'], axis=1)

# Removing Nan Location values from table
users.dropna(subset=['Location'])
users = users.reset_index(drop=True)

print('After : ' + str(users.shape))


Before : (278859, 3)
After : (278859, 2)


  interactivity=interactivity, compiler=compiler, result=result)


In [44]:
# 3.Read the data where ratings are given by users

# Import the dataset
book_ratings = pd.read_csv("BX-Book-Ratings.csv", encoding='iso-8859-1')

# Print the columns of the DataFrame
book_ratings.dtypes

# Identify the shape of the dataset
print(str(book_ratings.shape))

# Cleaning up NaN value entries
book_ratings.dropna()
book_ratings = book_ratings.reset_index(drop=True)

# Visualize the dataset
book_ratings.head()

(1048575, 3)


Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [45]:
# 4. Take a quick look at the number of unique users and books

user_id_arr = book_ratings.user_id.unique()
print('User_id : ' +  str(user_id_arr.shape))
book_isbn_arr = book_ratings.isbn.unique()
print('ISBN : ' + str(book_isbn_arr.shape))

User_id : (95513,)
ISBN : (322102,)


In [46]:
# 5. Convert ISBN variables to numeric numbers in the correct order

# Function to remove non-numeric characters from ISBM numbers
def convertToNumeric(inputStr):
    outputStr = ''
    for i in inputStr:
        if i.isdigit():
            outputStr += i
            
    return outputStr

print(book_ratings.isbn.unique()[-5:])
book_ratings['isbn'] = book_ratings['isbn'].apply(convertToNumeric)
print(book_ratings.isbn.unique()[-5:])

['440106575' '451157516' '048623715X' '486256588' '515069434']
['440106575' '451157516' '048623715' '486256588' '515069434']


In [47]:
# 6. Convert the user_id variable to numeric numbers in the correct order

book_ratings["user_id"] = pd.to_numeric(book_ratings["user_id"])


In [48]:
# 7. Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

book_ratings.head()

book_ratings.sort_values(by=['isbn'])
book_ratings.sort_values(by=['user_id'])

book_ratings.user_id.value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
94504         1
59675         1
41234         1
45509         1
59078         1
Name: user_id, Length: 95513, dtype: int64

In [49]:
# 8. Re-index the columns to build a matrix

# Merge book_ratings and users dataframe based on user_id column
# based on the user_ids present in book_ratings dataframe
final_book_df = pd.merge(book_ratings,users,on=['user_id'],how="inner")

# Merge book_ratings and books dataframe based on isbn column
# based on the isbn present in book_ratings dataframe
final_book_df = pd.merge(final_book_df,books,on=['isbn'],how="inner")

final_book_df.head()

Unnamed: 0,user_id,isbn,rating,Location,book_title,book_author,year_of_publication,publisher
0,2,195153448,0,"stockton, california, usa",Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,8,2005018,5,"timmins, ontario, canada",Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,11400,2005018,0,"ottawa, ontario, canada",Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
3,11676,2005018,8,"n/a, n/a, n/a",Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
4,41385,2005018,0,"sudbury, ontario, canada",Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [50]:
final_book_df.isna().any()
print(str(final_book_df.shape))

# Dropping user_id, isbn and book title columns since they
# do not play any role in training the classifier
final_book_df = final_book_df.drop(['user_id','isbn','book_title'], axis=1)
print(str(final_book_df.shape))

(855865, 8)
(855865, 5)


In [51]:
def editRating(num):
    output = 0
    
    if num > 5:
        output = 1
    else:
        output = 0
        
    return output
print(final_book_df.iloc[5])
final_book_df['rating'] = final_book_df['rating'].apply(editRating)
print(final_book_df.iloc[5])

rating                                        8
Location               toronto, ontario, canada
book_author                Richard Bruce Wright
year_of_publication                        2001
publisher                 HarperFlamingo Canada
Name: 5, dtype: object
rating                                        1
Location               toronto, ontario, canada
book_author                Richard Bruce Wright
year_of_publication                        2001
publisher                 HarperFlamingo Canada
Name: 5, dtype: object


In [52]:
# Coverting all values of locations to numeric
# for Logistic regression
arr1 = final_book_df.Location.unique()
ht1 = {}
for i in range(len(arr1)):
    ht1[arr1[i]] = i
def editLocation(loc):
    return ht1[loc]
final_book_df['Location'] = final_book_df['Location'].apply(editLocation)

In [53]:
# Coverting all values of Book suthors to numeric
# for Logistic regression
arr2 = final_book_df.book_author.unique()
ht2 = {}
for i in range(len(arr2)):
    ht2[arr2[i]] = i
def editAuthor(author):
    return ht2[author]
final_book_df['book_author'] = final_book_df['book_author'].apply(editAuthor)

In [54]:
# Coverting all values of publishers to numeric
# for Logistic regression
arr3 = final_book_df.publisher.unique()
ht3 = {}
for i in range(len(arr3)):
    ht3[arr3[i]] = i
def editPublisher(publisher):
    return ht3[publisher]
final_book_df['publisher'] = final_book_df['publisher'].apply(editPublisher)

In [55]:
final_book_df.dtypes

rating                 int64
Location               int64
book_author            int64
year_of_publication    int64
publisher              int64
dtype: object

In [31]:
# 9. Split your data into two sets (training and testing)

# Collecting features which are useful for training classifier
used_features =[
    "Location",
    "book_author",
    "year_of_publication",
    "publisher"
]

X = final_book_df[used_features].values
y = final_book_df['rating']

# Split dataset in training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)



(599105, 4)
(256760, 4)
(599105,)
(256760,)


In [32]:
# 10. Make predictions based on user and item variables

# Instantiate the classifier
LogReg = LogisticRegression()

# Train classifier
LogReg.fit(X_train, y_train)

y_pred = LogReg.predict(X_test)


In [33]:
# 11. Use RMSE to evaluate the predictions

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

[[171906   4123]
 [ 77620   3111]]
0.6816365477488705
0.31836345225112944
