# Analysis of datasets

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
books_df = pd.read_csv(r'../artifacts/data_ingestion/books.csv')
ratings_df = pd.read_csv(r'../artifacts/data_ingestion/ratings.csv')
users_df = pd.read_csv(r'../artifacts/data_ingestion/users.csv')

In [4]:
books_df.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [5]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [6]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
dfs_dict = {
    'books_df': books_df,
    'ratings_df': ratings_df,
    'users_df': users_df
}

In [10]:
print("-"*30,"shapes of DataFrames","-"*30)
for i in dfs_dict.keys():
    print(f"shape of {i}: ", dfs_dict[i].shape)

------------------------------ shapes of DataFrames ------------------------------
shape of books_df:  (271360, 6)
shape of ratings_df:  (1149780, 3)
shape of users_df:  (278858, 3)


In [17]:
print("-"*30,"Basic info about DataFrames","-"*30)
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30}info about {i}{'*'*30}")
    print(dfs_dict[i].info())

------------------------------ Basic info about DataFrames ------------------------------


******************************info about books_df******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-L          271357 non-null  object
dtypes: object(6)
memory usage: 12.4+ MB
None


******************************info about ratings_df******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       -------------- 

In [21]:
print(f"{'-'*50} Column names {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    print(list(dfs_dict[i].columns))

-------------------------------------------------- Column names --------------------------------------------------


****************************** books_df ******************************
['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']


****************************** ratings_df ******************************
['User-ID', 'ISBN', 'Book-Rating']


****************************** users_df ******************************
['User-ID', 'Location', 'Age']


In [22]:
print(f"{'-'*50} Unique Entries Counts {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    print(dfs_dict[i].nunique())

-------------------------------------------------- Unique Entries Counts --------------------------------------------------


****************************** books_df ******************************
ISBN                   271359
Book-Title             242135
Book-Author            102022
Year-Of-Publication       207
Publisher               16807
Image-URL-L            271041
dtype: int64


****************************** ratings_df ******************************
User-ID        105283
ISBN           340031
Book-Rating        11
dtype: int64


****************************** users_df ******************************
User-ID     278858
Location     57339
Age            165
dtype: int64


In [24]:
print(f"{'-'*50} Null entries {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    print(dfs_dict[i].isnull().sum())

-------------------------------------------------- Null entries --------------------------------------------------


****************************** books_df ******************************
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-L            3
dtype: int64


****************************** ratings_df ******************************
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64


****************************** users_df ******************************
User-ID          0
Location         0
Age         110762
dtype: int64


In [26]:
print(f"{'-'*50} Droping Null Entries {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    print("Dataframe befor droping null values")
    print(dfs_dict[i].isnull().sum())
    dfs_dict[i].dropna(axis=0, inplace=True)
    print("Dataframe after droping null values")
    print(dfs_dict[i].isnull().sum())

-------------------------------------------------- Droping Null Entries --------------------------------------------------


****************************** books_df ******************************
Dataframe befor droping null values
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-L            3
dtype: int64
Dataframe after droping null values
ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-L            0
dtype: int64


****************************** ratings_df ******************************
Dataframe befor droping null values
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
Dataframe after droping null values
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64


****************************** users_df ******************************
Dataframe befor droping null values
User-ID          0
Loc

In [31]:
print(f"{'-'*50} Duplicate Rows {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    print(f"Duplicate rows: {dfs_dict[i].duplicated().sum()}")

-------------------------------------------------- Duplicate Rows --------------------------------------------------


****************************** books_df ******************************
Duplicate rows: 1


****************************** ratings_df ******************************
Duplicate rows: 26


****************************** users_df ******************************
Duplicate rows: 0


In [32]:
print(f"{'-'*50} Droping Duplicate rows {'-'*50}")
for i in dfs_dict.keys():
    print("\n")
    print(f"{'*'*30} {i} {'*'*30}")
    dfs_dict[i].drop_duplicates(inplace=True)
    print("After droping duplicate rows")
    print(f"duplicate rows: {dfs_dict[i].duplicated().sum()}")

-------------------------------------------------- Droping Duplicate rows --------------------------------------------------


****************************** books_df ******************************
After droping duplicate rows
duplicate rows: 0


****************************** ratings_df ******************************
After droping duplicate rows
duplicate rows: 0


****************************** users_df ******************************
After droping duplicate rows
duplicate rows: 0


In [34]:
print("-"*30,"Datasets, After droping null values and duplicate rows","-"*30)
for i in dfs_dict.keys():
    print(f"shape of {i}: ", dfs_dict[i].shape)

------------------------------ Datasets, After droping null values and duplicate rows ------------------------------
shape of books_df:  (271352, 6)
shape of ratings_df:  (1149754, 3)
shape of users_df:  (168096, 3)
