# Gathering the data

In [1]:
import numpy as np
import pandas as pd

In [3]:
# importing our data
amazon_bs_books = pd.read_excel ("Amazon Best Selling Books (2019 - 2021).xlsx")

# looking at the information in our data
# amazon_bs_books

#first five rows in our data
amazon_bs_books.head()

Unnamed: 0.1,Unnamed: 0,price,ranks,title,no_of_reviews,ratings,author,cover_type,year,genre
0,0,12.49,1.0,The Lost Symbol,16118,4.4,Dan Brown,Hardcover,2009,Fiction
1,1,13.4,2.0,The Shack: Where Tragedy Confronts Eternity,23392,4.7,William P. Young,Paperback,2009,Fiction
2,2,9.93,3.0,Liberty and Tyranny: A Conservative Manifesto,5036,4.8,Mark R. Levin,Hardcover,2009,Non Fiction
3,3,14.3,4.0,"Breaking Dawn (The Twilight Saga, Book 4)",16912,4.7,Stephenie Meyer,Hardcover,2009,Fiction
4,4,9.99,5.0,Going Rogue: An American Life,1572,4.6,Sarah Palin,Hardcover,2009,Non Fiction


In [5]:
# the last five rows in our data
amazon_bs_books.tail()

Unnamed: 0.1,Unnamed: 0,price,ranks,title,no_of_reviews,ratings,author,cover_type,year,genre
1286,1286,16.69,96.0,Will,Will Smith,4.8,,Hardcover,2021,unknown
1287,1287,7.49,97.0,Think and Grow Rich: The Landmark Bestseller N...,79872,4.7,Napoleon Hill,Paperback,2021,unknown
1288,1288,8.95,98.0,Dragons Love Tacos,15753,4.8,Adam Rubin,Hardcover,2021,unknown
1289,1289,7.59,99.0,The Truth About COVID-19: Exposing The Great R...,Doctor Joseph Mercola,4.8,,Hardcover,2021,unknown
1290,1290,13.29,100.0,First Little Readers Parent Pack: Guided Readi...,27332,4.7,Deborah Schecter,Paperback,2021,unknown


# Accessing the Amazon data

In [9]:
# checking the number of rows and columns
amazon_bs_books.shape

(1291, 10)

In [11]:
# accessing the features or columns of our dataset to get a concise summary of our dataset
amazon_bs_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1291 entries, 0 to 1290
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1291 non-null   int64  
 1   price          1287 non-null   float64
 2   ranks          1287 non-null   float64
 3   title          1284 non-null   object 
 4   no_of_reviews  1285 non-null   object 
 5   ratings        1287 non-null   float64
 6   author         1268 non-null   object 
 7   cover_type     1286 non-null   object 
 8   year           1291 non-null   int64  
 9   genre          1291 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 101.0+ KB


In [13]:
# getting the statistical description of our data
amazon_bs_books.describe()

Unnamed: 0.1,Unnamed: 0,price,ranks,ratings,year
count,1291.0,1287.0,1287.0,1287.0,1291.0
mean,645.0,13.757117,50.246309,4.650894,2015.027111
std,372.823908,10.046391,28.803308,0.197174,3.734404
min,0.0,1.17,1.0,3.4,2009.0
25%,322.5,8.55,25.0,4.6,2012.0
50%,645.0,12.1,50.0,4.7,2015.0
75%,967.5,16.29,75.0,4.8,2018.0
max,1290.0,144.0,100.0,4.9,2021.0


In [14]:
# looking at ramdom samples in our dataset to get more information
amazon_bs_books.sample(5)

Unnamed: 0.1,Unnamed: 0,price,ranks,title,no_of_reviews,ratings,author,cover_type,year,genre
127,127,11.49,31.0,Three Cups of Tea: One Man's Mission to Promot...,3497,4.4,Greg Mortenson,Paperback,2010,Non Fiction
1256,1256,16.75,66.0,Think Again: The Power of Knowing What You Don...,12028,4.6,Adam Grant,Hardcover,2021,unknown
59,59,20.58,62.0,The Host,9715,4.6,Stephenie Meyer,Hardcover,2009,Fiction
617,617,10.5,26.0,How to Win Friends & Influence People,79094,4.7,Dale Carnegie,Paperback,2015,Non Fiction
838,838,8.99,47.0,Dog Man: A Tale of Two Kitties: From the Creat...,16127,4.9,Dav Pilkey,Hardcover,2017,Fiction


In [15]:
# checking if there are missing values in our data
amazon_bs_books.isnull()

Unnamed: 0.1,Unnamed: 0,price,ranks,title,no_of_reviews,ratings,author,cover_type,year,genre
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1286,False,False,False,False,False,False,True,False,False,False
1287,False,False,False,False,False,False,False,False,False,False
1288,False,False,False,False,False,False,False,False,False,False
1289,False,False,False,False,False,False,True,False,False,False


In [16]:
#critically checking if there are missing values in our data
amazon_bs_books.isnull().sum()

Unnamed: 0        0
price             4
ranks             4
title             7
no_of_reviews     6
ratings           4
author           23
cover_type        5
year              0
genre             0
dtype: int64

# Cleaning the Amazon data

In [20]:
# filling missing values in 'title" column with 'unknown'
amazon_bs_books['title'].fillna('Unknown', inplace=True)

#filling missing values in 'author' columnwith 'Unknown'
amazon_bs_books['author'].fillna('Unknown', inplace=True)

#filling the missing numbers in 'no of reviews' column with 'Unknown'
amazon_bs_books['no_of_reviews'].fillna('Unknown', inplace=True)

# Filling missing values in 'ranks' with -1 to indicate absence of rank
amazon_bs_books['ranks'].fillna(-1, inplace=True)

# Filling missing values in 'cover_type' with the mode of the available data i.e the mst common cover type.
mode_cover_type = amazon_bs_books['cover_type'].mode()[0]
amazon_bs_books['cover_type'].fillna(mode_cover_type, inplace=True)

# Filling missing values in 'price' and 'ratings' with the median value
median_price = amazon_bs_books['price'].median()
median_ratings = amazon_bs_books['ratings'].median()
amazon_bs_books['price'].fillna(median_price, inplace=True)
amazon_bs_books['ratings'].fillna(median_ratings, inplace=True)

# Displaying the updated dataset
print("Updated dataset after filling missing values:")
print(amazon_bs_books.isnull().sum())

Updated dataset after filling missing values:
Unnamed: 0       0
price            0
ranks            0
title            0
no_of_reviews    0
ratings          0
author           0
cover_type       0
year             0
genre            0
dtype: int64


# Now, moving on

In [21]:
# Extracting relevant columns
relevant_columns = ['title', 'author', 'price', 'ratings', 'year', 'cover_type', 'genre']
amazon_bs_books_subset = amazon_bs_books[relevant_columns]

In [22]:
# Splitting the 'author' column into 'first_name' and 'last_name'
amazon_bs_books_subset[['first_name', 'last_name']] = amazon_bs_books_subset['author'].str.split(' ', n=1, expand=True)

# Displaying the updated dataset
amazon_bs_books_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_bs_books_subset[['first_name', 'last_name']] = amazon_bs_books_subset['author'].str.split(' ', n=1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_bs_books_subset[['first_name', 'last_name']] = amazon_bs_books_subset['author'].str.split(' ', n=1, expand=True)


Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin
3,"Breaking Dawn (The Twilight Saga, Book 4)",Stephenie Meyer,14.3,4.7,2009,Hardcover,Fiction,Stephenie,Meyer
4,Going Rogue: An American Life,Sarah Palin,9.99,4.6,2009,Hardcover,Non Fiction,Sarah,Palin


# Strings, String Operations, and Methods
Let's try to see how basic string operations works (concatenation, repetition, slicing and length) and also

Let's try and see basic string methods works (Capitalize, Upper and Lower Case, Find and Replace)

# Concatenation of strings (objects)

In [23]:
# Now lets try to cocatenate back the 'first_name' and the 'last_name'columns

# Concatenating 'first_name' and 'last_name' columns back together
amazon_bs_books_subset['full_name'] = amazon_bs_books_subset['first_name'] + ' ' + amazon_bs_books_subset['last_name']

In [24]:
# checking if it worked
amazon_bs_books_subset

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.40,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin
3,"Breaking Dawn (The Twilight Saga, Book 4)",Stephenie Meyer,14.30,4.7,2009,Hardcover,Fiction,Stephenie,Meyer,Stephenie Meyer
4,Going Rogue: An American Life,Sarah Palin,9.99,4.6,2009,Hardcover,Non Fiction,Sarah,Palin,Sarah Palin
...,...,...,...,...,...,...,...,...,...,...
1286,Will,Unknown,16.69,4.8,2021,Hardcover,unknown,Unknown,,
1287,Think and Grow Rich: The Landmark Bestseller N...,Napoleon Hill,7.49,4.7,2021,Paperback,unknown,Napoleon,Hill,Napoleon Hill
1288,Dragons Love Tacos,Adam Rubin,8.95,4.8,2021,Hardcover,unknown,Adam,Rubin,Adam Rubin
1289,The Truth About COVID-19: Exposing The Great R...,Unknown,7.59,4.8,2021,Hardcover,unknown,Unknown,,


In [25]:
# it worked because the 'first name' and 'last name'' columns are objects(strings)
amazon_bs_books_subset.dtypes

title          object
author         object
price         float64
ratings       float64
year            int64
cover_type     object
genre          object
first_name     object
last_name      object
full_name      object
dtype: object

In [None]:
# Dropping the original 'author' column and unnecessary 'first_name' and 'last_name' columns
# amazon_bs_books_subset.drop(columns=['author', 'first_name', 'last_name'], inplace=True)

# this is what you should normally do to avoid having redundant columns but we will ignore it for now

In [26]:
# let's try and concatenate the first name and price columns
amazon_bs_books_subset['first_name_price'] = amazon_bs_books_subset['first_name'] + ' ' + amazon_bs_books_subset['price']

TypeError: can only concatenate str (not "float") to str

 # Error because string and integers cannot be concatenated

# Let's Continue!

In [None]:
Slicing of Strings (objects)
In pandas, iloc and loc are attributes used for slicing. They are used for indexing and selecting subsets of data from a DataFrame or Series.

iloc: This attribute is primarily integer-based indexing. It is used for selecting rows and columns by their integer index. You can use iloc to select specific rows and columns based on their integer position.

Example:

# Selecting the first row and first column
df.iloc[0, 0]

# Selecting the first three rows and all columns
df.iloc[:3, :]

# Selecting all rows and the first three columns
df.iloc[:, :3]
loc: This attribute is label-based indexing. It is used for selecting rows and columns by their label or boolean array. You can use loc to select specific rows and columns based on their labels or boolean conditions.

Example:


# Selecting the row with label 'A' and all columns
df.loc['A', :]

# Selecting rows with labels 'A' and 'B' and all columns
df.loc[['A', 'B'], :]

# Selecting rows where the value in column 'A' is greater than 0
df.loc[df['A'] > 0, :]

In [27]:
# Remember, we are working with this dataset
amazon_bs_books_subset.head()

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin
3,"Breaking Dawn (The Twilight Saga, Book 4)",Stephenie Meyer,14.3,4.7,2009,Hardcover,Fiction,Stephenie,Meyer,Stephenie Meyer
4,Going Rogue: An American Life,Sarah Palin,9.99,4.6,2009,Hardcover,Non Fiction,Sarah,Palin,Sarah Palin


In [28]:
# Selecting a few titles from the DataFrame

# We could just do this
selected_titles = amazon_bs_books_subset['title'].head()
selected_titles

0                                  The Lost Symbol
1      The Shack: Where Tragedy Confronts Eternity
2    Liberty and Tyranny: A Conservative Manifesto
3        Breaking Dawn (The Twilight Saga, Book 4)
4                    Going Rogue: An American Life
Name: title, dtype: object

In [29]:
# But we can use the loc method to specifically select all rows within the title columns

amazon_bs_books_subset.loc[:, 'title'] # usng loc to access all rows within the title column

0                                         The Lost Symbol
1             The Shack: Where Tragedy Confronts Eternity
2           Liberty and Tyranny: A Conservative Manifesto
3               Breaking Dawn (The Twilight Saga, Book 4)
4                           Going Rogue: An American Life
                              ...                        
1286                                                 Will
1287    Think and Grow Rich: The Landmark Bestseller N...
1288                                   Dragons Love Tacos
1289    The Truth About COVID-19: Exposing The Great R...
1290    First Little Readers Parent Pack: Guided Readi...
Name: title, Length: 1291, dtype: object

In [30]:
# Slicing the first title to extract the first 10 characters

selected_titles.iloc[0][:10]  # Using iloc to access the first title and slicing

'The Lost S'

In [31]:
# Slicing the second title to extract characters from index 5 to index 15

selected_titles.iloc[1][5:15]  # Using iloc to access the second title and slicing

'hack: Wher'

In [32]:
# Slicing the third title to extract the last 5 characters

selected_titles.iloc[2][-5:]  # Using iloc to access the third title and slicing

'festo'

# Capitalize, Upper and Lower Case, Find and Replace

In [42]:
# http://localhost:8888/notebooks/OneDrive/Documents/Documents/HerTechTrail/Data%20Analytics%20Pro%20Track/Week%203%20-%20Jupter%20Notebook/Week%203%20-%20Sunday%20-%20Practising%20All%20We%20Learned%20under%20Python%20Fundamentals%20.ipynb#Capitalize,-Upper-and-Lower-Case,-Find-and-Replace# Convert titles to lowercase in the DataFrame
amazon_bs_books_subset['title_lower'] = amazon_bs_books_subset['title'].str.lower()

# Convert titles to upper case in the DataFrame
amazon_bs_books_subset['title_upper'] = amazon_bs_books_subset['title'].str.upper()

amazon_bs_books_subset.head()

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name,title_lower,title_upper
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown,the lost symbol,THE LOST SYMBOL
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young,the shack: where tragedy confronts eternity,THE SHACK: WHERE TRAGEDY CONFRONTS ETERNITY
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO
3,"Breaking Dawn (The Twilight Saga, Book 4)",Stephenie Meyer,14.3,4.7,2009,Hardcover,Fiction,Stephenie,Meyer,Stephenie Meyer,"breaking dawn (the twilight saga, book 4)","BREAKING DAWN (THE TWILIGHT SAGA, BOOK 4)"
4,Going Rogue: An American Life,Sarah Palin,9.99,4.6,2009,Hardcover,Non Fiction,Sarah,Palin,Sarah Palin,going rogue: an american life,GOING ROGUE: AN AMERICAN LIFE


In [43]:
# Capitalize the titles in the DataFrame
amazon_bs_books_subset['title_capitalized'] = amazon_bs_books_subset['title_lower'].str.capitalize()

# Display the DataFrame with lowercase and capitalized titles
amazon_bs_books_subset[['title', 'title_lower', 'title_upper', 'title_capitalized']].head()

Unnamed: 0,title,title_lower,title_upper,title_capitalized
0,The Lost Symbol,the lost symbol,THE LOST SYMBOL,The lost symbol
1,The Shack: Where Tragedy Confronts Eternity,the shack: where tragedy confronts eternity,THE SHACK: WHERE TRAGEDY CONFRONTS ETERNITY,The shack: where tragedy confronts eternity
2,Liberty and Tyranny: A Conservative Manifesto,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO,Liberty and tyranny: a conservative manifesto
3,"Breaking Dawn (The Twilight Saga, Book 4)","breaking dawn (the twilight saga, book 4)","BREAKING DAWN (THE TWILIGHT SAGA, BOOK 4)","Breaking dawn (the twilight saga, book 4)"
4,Going Rogue: An American Life,going rogue: an american life,GOING ROGUE: AN AMERICAN LIFE,Going rogue: an american life


In [None]:
# Find and Replace

# The find() method is indeed a built-in method for Python strings, but it's not directly applicable to Pandas DataFrame columns.

So, do we apply the find() method in Pandas DatafFrame?

By using methods like str.contains() or str.match() to find specific patterns within strings in DataFrame columns.

In [44]:
# Find books with 'Hardcover' type in the cover_type column
hardcover_books = amazon_bs_books_subset[amazon_bs_books_subset['cover_type'].str.contains('Hardcover', na=False)]

# Display the DataFrame with books having 'Hardcover' type
hardcover_books

# 0R

hardcover_books.head()

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name,title_lower,title_upper,title_capitalized
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown,the lost symbol,THE LOST SYMBOL,The lost symbol
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO,Liberty and tyranny: a conservative manifesto
3,"Breaking Dawn (The Twilight Saga, Book 4)",Stephenie Meyer,14.3,4.7,2009,Hardcover,Fiction,Stephenie,Meyer,Stephenie Meyer,"breaking dawn (the twilight saga, book 4)","BREAKING DAWN (THE TWILIGHT SAGA, BOOK 4)","Breaking dawn (the twilight saga, book 4)"
4,Going Rogue: An American Life,Sarah Palin,9.99,4.6,2009,Hardcover,Non Fiction,Sarah,Palin,Sarah Palin,going rogue: an american life,GOING ROGUE: AN AMERICAN LIFE,Going rogue: an american life
5,StrengthsFinder 2.0,Gallup,18.29,4.1,2009,Hardcover,Non Fiction,Gallup,,,strengthsfinder 2.0,STRENGTHSFINDER 2.0,Strengthsfinder 2.0


In [45]:
# Replace 'Hardcover' with 'HB' in the titles
hardcover_books['cover_type_replaced'] = hardcover_books['cover_type'].str.replace('Hardcover', 'HB')

# Display the DataFrame with replaced titles
hardcover_books[['cover_type', 'cover_type_replaced']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hardcover_books['cover_type_replaced'] = hardcover_books['cover_type'].str.replace('Hardcover', 'HB')


Unnamed: 0,cover_type,cover_type_replaced
0,Hardcover,HB
2,Hardcover,HB
3,Hardcover,HB
4,Hardcover,HB
5,Hardcover,HB


# Replacing "Hardcover" cover type books with "HB" using
a) if and else statements

b) for loops

c) while loops

In [46]:
# Remember we arestill working with this data
amazon_bs_books_subset.head(3)

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name,title_lower,title_upper,title_capitalized
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown,the lost symbol,THE LOST SYMBOL,The lost symbol
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young,the shack: where tragedy confronts eternity,THE SHACK: WHERE TRAGEDY CONFRONTS ETERNITY,The shack: where tragedy confronts eternity
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO,Liberty and tyranny: a conservative manifesto


# Using the If and Else Statements within a For Loop

In [None]:
# Create an empty list to store the replaced cover types
replaced_cover_types = []

# Iterate through each cover type
for cover_type in amazon_bs_books_subset['cover_type']:
    # Check if the cover type contains 'Hardcover'
    if 'Hardcover' in cover_type:
        # If 'Hardcover' is found, replace it with 'HB' and append to the list
        replaced_cover_types.append(cover_type.replace('Hardcover', 'HB'))
    else:
        # If 'Hardcover' is not found, keep the original cover type
        replaced_cover_types.append(cover_type)

# Add the list of replaced cover types as a new column to the DataFrame
amazon_bs_books_subset['cover_type_replaced'] = replaced_cover_types

In [47]:
amazon_bs_books_subset.head(3)

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name,title_lower,title_upper,title_capitalized
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown,the lost symbol,THE LOST SYMBOL,The lost symbol
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young,the shack: where tragedy confronts eternity,THE SHACK: WHERE TRAGEDY CONFRONTS ETERNITY,The shack: where tragedy confronts eternity
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO,Liberty and tyranny: a conservative manifesto


# Using the While Loop

In [48]:
# Create an empty list to store the replaced cover types
replaced_cover_types_2 = []

# Get the number of rows in the DataFrame
num_rows = len(amazon_bs_books_subset)

# Initialize the loop counter
i = 0

# Iterate through each row using a while loop
while i < num_rows:
    # Get the cover type of the current row
    cover_type = amazon_bs_books_subset.loc[i, 'cover_type']
    # Check if the cover type contains 'Hardcover'
    if 'Hardcover' in cover_type:
        # If 'Hardcover' is found, replace it with 'HB' and append to the list
        replaced_cover_types_2.append(cover_type.replace('Hardcover', 'HB'))
    else:
        # If 'Hardcover' is not found, keep the original cover type
        replaced_cover_types_2.append(cover_type)
    # Move to the next row
    i += 1

# Add the list of replaced cover types as a new column to the DataFrame
amazon_bs_books_subset['cover_type_replaced_2'] = replaced_cover_types


NameError: name 'replaced_cover_types' is not defined

In [49]:
amazon_bs_books_subset.head(3)

Unnamed: 0,title,author,price,ratings,year,cover_type,genre,first_name,last_name,full_name,title_lower,title_upper,title_capitalized
0,The Lost Symbol,Dan Brown,12.49,4.4,2009,Hardcover,Fiction,Dan,Brown,Dan Brown,the lost symbol,THE LOST SYMBOL,The lost symbol
1,The Shack: Where Tragedy Confronts Eternity,William P. Young,13.4,4.7,2009,Paperback,Fiction,William,P. Young,William P. Young,the shack: where tragedy confronts eternity,THE SHACK: WHERE TRAGEDY CONFRONTS ETERNITY,The shack: where tragedy confronts eternity
2,Liberty and Tyranny: A Conservative Manifesto,Mark R. Levin,9.93,4.8,2009,Hardcover,Non Fiction,Mark,R. Levin,Mark R. Levin,liberty and tyranny: a conservative manifesto,LIBERTY AND TYRANNY: A CONSERVATIVE MANIFESTO,Liberty and tyranny: a conservative manifesto


# Showcasig How Python Operators Work
Remember we have:

Arithmetic Operators
Arithmetic operators are used to perform mathematical operations like addition, subtraction, multiplication

Assignmemt Operators
Assignment operators are used to assign values to variables. e.g (=)

Comparison Operators
Comparison operators in Python are symbols used to compare two values or expressions, resulting in a Boolean value indicating whether the comparison is true or false. e.g (==), (>), (<), e.t.c

Logical Operators
Logical operators are used to check whether an expression is True or False. They are used in decision-making. e.g 0r, and, and nor.

In [50]:
# Using Greater_than '>' sign comparison operator
    
# Books with rating above a certain threshold
threshold = 4.5
high_rated_books = amazon_bs_books_subset[amazon_bs_books_subset['ratings'] > threshold]

print("\nBooks with a rating above", threshold, ":")
print(high_rated_books[['title', 'ratings']])


Books with a rating above 4.5 :
                                                  title  ratings
1           The Shack: Where Tragedy Confronts Eternity      4.7
2         Liberty and Tyranny: A Conservative Manifesto      4.8
3             Breaking Dawn (The Twilight Saga, Book 4)      4.7
4                         Going Rogue: An American Life      4.6
6                                              The Help      4.8
...                                                 ...      ...
1286                                               Will      4.8
1287  Think and Grow Rich: The Landmark Bestseller N...      4.7
1288                                 Dragons Love Tacos      4.8
1289  The Truth About COVID-19: Exposing The Great R...      4.8
1290  First Little Readers Parent Pack: Guided Readi...      4.7

[982 rows x 2 columns]


In [51]:
# Using the Equal to (==) sign comparison operator

# Books published in a specific year
specific_year = 2020
books_in_specific_year = amazon_bs_books_subset[amazon_bs_books_subset['year'] == specific_year]

print("\nBooks published in", specific_year, ":")
print(books_in_specific_year[['title', 'year']])


Books published in 2020 :
                                                  title  year
1091                                    A Promised Land  2020
1092  Too Much and Never Enough: How My Family Creat...  2020
1093                            Where the Crawdads Sing  2020
1094  My First Learn-to-Write Workbook: Practice for...  2020
1095                                       Midnight Sun  2020
...                                                 ...   ...
1186  Relationship Goals: How to Win at Dating, Marr...  2020
1187  The Happy in a Hurry Cookbook: 100-Plus Fast a...  2020
1188  Learn to Read: A Magical Sight Words and Phoni...  2020
1189    P is for Potty! (Sesame Street) (Lift-the-Flap)  2020
1190  D&D Player's Handbook (Dungeons & Dragons Core...  2020

[100 rows x 2 columns]


# A demonstration: Using Try and Except statement

In [52]:
# Trying to access a column that does not exists

try:
    # Attempt to access a column that may or may not exist
    unknown_column = amazon_bs_books_subset['unknown_column']
    print("Accessing the unknown column succeeded. The first few values are:")
    print(unknown_column.head())
except KeyError:
    # Handle the KeyError exception if the column does not exist
    print("The specified column does not exist.")

The specified column does not exist.


# A demonstration: Creating a Function


In [53]:
# Let's create a function that calculates the average price of the books

def calculate_average_price(df):
    """Calculate the average price of books."""
    average_price = df['price'].mean()
    return average_price

# Call the function and pass the Amazon Best Selling Books subset DataFrame
avg_price = calculate_average_price(amazon_bs_books_subset)
print("Average price of books:", avg_price)

Average price of books: 13.75198295894655


In [64]:
# let's confirm this
avg_price = amazon_bs_books_subset['price'].mean()

avg_price

13.75198295894655

In [None]:
You see that the average price is really 13.7
avg_price