In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [57]:
books = pd.read_csv('../data/raw/BX-Books.csv', sep=';',encoding='latin1', on_bad_lines='skip')
users = pd.read_csv('../data/raw/BX-Users.csv', sep=';',encoding='latin1', on_bad_lines='skip')
ratings = pd.read_csv('../data/raw/BX-Book-Ratings.csv', sep=';',encoding='latin1', on_bad_lines='skip')

In [58]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [59]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [60]:
books.rename(columns={
    'ISBN': 'isbn',
    'Book-Title': 'title', 
    'Book-Author': 'author', 
    'Year-Of-Publication':'year', 
    'Publisher': 'publisher',
    'Image-URL-L' : 'image_url'
}, inplace=True)

In [61]:
books.shape

(271360, 8)

In [62]:
books.isnull().sum()

isbn           0
title          0
author         2
year           0
publisher      2
Image-URL-S    0
Image-URL-M    0
image_url      3
dtype: int64

In [63]:
books.dropna(how='any', axis=0, inplace=True)

In [64]:
books.shape

(271353, 8)

In [65]:
books.duplicated().sum()

np.int64(0)

In [66]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [67]:
ratings.rename(columns={
    'User-ID':'user_id',
    'ISBN' : 'isbn',
    'Book-Rating' : 'rating'
}, inplace=True)

In [68]:
ratings.shape

(1149780, 3)

In [81]:
ratings.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [82]:
ratings.duplicated().sum()

np.int64(0)

In [69]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [70]:
users.rename(columns={
    'User-ID':'user_id',
    'Location': 'location',
    'Age': 'age'
}, inplace=True)

In [71]:
users.shape

(278858, 3)

In [72]:
users.isnull().sum()

user_id          0
location         0
age         110762
dtype: int64

In [73]:
users['age'].median()

np.float64(32.0)

In [74]:
users['age'].mode()

0    24.0
Name: age, dtype: float64

In [75]:
age_mode = int(users['age'].mode())
users['age'].fillna(age_mode, inplace=True)

In [76]:
users.isnull().sum()

user_id     0
location    0
age         0
dtype: int64

In [77]:
users['age'].min()

np.float64(0.0)

In [78]:
users['age'].max()

np.float64(244.0)

In [79]:
users.duplicated().sum()

np.int64(0)

In [80]:
users.shape

(278858, 3)

In [83]:
books.to_csv('../data/cleaned/books.csv', index=False)
users.to_csv('../data/cleaned/users.csv', index=False)
ratings.to_csv('../data/cleaned/ratings.csv', index=False)