In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Indicate file path and load it as df 
file_path = '../Resources/scraped_dog_food_20240121.csv'
dog_kibble_df = pd.read_csv(file_path)

dog_kibble_df.head()

Unnamed: 0,Product Name,Price,Bag Size,Description,Ingredients,Guaranteed Analysis,Scraping Date,Product Link
0,performatrin Ultra Limited Ingredient Kangaroo...,$112.99,24 lb,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
1,performatrin Ultra Wholesome Grains Lamb & Bro...,$86.99,24 lb,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
2,performatrin Prime Chicken & Rice Formula Larg...,$99.99,37 lb,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-pr...
3,performatrin Ultra Limited Ingredient Sweet Po...,$115.99,24 lb,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
4,ACANA Classics Red Meat Recipe Dog Food,$98.99,31.9 lb,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,2024-01-21,https://www.petvalu.ca/product/acana-classic-r...


In [3]:
# Get the number of rows & column names
num_rows = dog_kibble_df.shape[0]
column_names = dog_kibble_df.columns.tolist()

print(f"Number of rows: {num_rows}")
print(f"Column names: {column_names}")

Number of rows: 346
Column names: ['Product Name', 'Price', 'Bag Size', 'Description', 'Ingredients', 'Guaranteed Analysis', 'Scraping Date', 'Product Link']


In [4]:
# Drop empty rows
dog_kibble_df.dropna(inplace=True)
num_rows_cleaned = dog_kibble_df.shape[0]

print(f"Number of rows after removing empty rows: {num_rows_cleaned}")

Number of rows after removing empty rows: 346


In [5]:
# Separate the Brand Name from the Kibble Name
brand_names = ["ACANA", "Barker's Complete", "Blue Buffalo", "Diamond Naturals", "Diamond Care", 
               "Gather", "Go! Solutions", "Hill's Science Diet", "Iams", "Lifetime", 
               "Merrick", "Natural Balance", "Now Fresh", "Nutro", "Open Farm", "ORIJEN", 
               "performatrin", "performatrin NATURALS", "performatrin Prime", 
               "performatrin Ultra", "Purina", "Purina Pro Plan", "Royal Canin", "Stella & Chewy's", 
               "Summit", "Taste of the Wild", "Wilder Harrier", "Zignature"]

# Create a pattern to match brand names
brand_pattern = '|'.join(brand_names)

# Extract brand and kibble name using regex
dog_kibble_df[['Brand', 'Kibble Name']] = dog_kibble_df['Product Name'].str.extract(f'({brand_pattern})\s?(.*)', expand=True)

### SEPARATES THE BRANDS EXCEPT FOR WHEN THE BRAND NAME REPEATS LIKE "performatrin", "performatrin NATURALS", "Purina", "Purina Pro Plan"

In [6]:
# Separate sub brands
sub_brand_names = ["NATURALS", "Prime", "Ultra", "Pro Plan", "Naturals", "Care"]

# Function to check if Kibble Name contains any sub-brand name
def extract_sub_brand(kibble_name):
    return next((sub_brand for sub_brand in sub_brand_names if isinstance(kibble_name, str) and sub_brand in kibble_name), None)

# Create 'Sub Brand' column
dog_kibble_df['Sub Brand'] = dog_kibble_df['Kibble Name'].apply(extract_sub_brand)

# Update 'Brand' column by joining with 'Sub Brand' if applicable
dog_kibble_df['Brand'] = np.where(~dog_kibble_df['Sub Brand'].isna(), dog_kibble_df['Brand'] + ' ' + dog_kibble_df['Sub Brand'], dog_kibble_df['Brand'])

# Remove sub-brand from 'Kibble Name'
def remove_sub_brand(kibble_name, sub_brand):
    if isinstance(kibble_name, str) and isinstance(sub_brand, str):
        return kibble_name.replace(sub_brand, '').strip()
    return kibble_name

dog_kibble_df['Kibble Name'] = dog_kibble_df.apply(lambda row: remove_sub_brand(row['Kibble Name'], row['Sub Brand']), axis=1)

In [7]:
# Remove '$' from 'Price' values and convert to numeric
dog_kibble_df['Price($)'] = pd.to_numeric(dog_kibble_df['Price'].str.replace(r'[^0-9.]', '', regex=True), errors='coerce')

# Some Values in "Price" look like this $49.99$56.99 due to being on sale while scraping  
# Identify rows with the specified format in 'Price'
odd_price_indices = dog_kibble_df[dog_kibble_df['Price'].astype(str).str.contains(r'\$\d+\.\d+\$\d+\.\d+')].index

# Replace the first number in 'Price' column for the identified rows with an empty string
dog_kibble_df.loc[odd_price_indices, 'Price($)'] = dog_kibble_df.loc[odd_price_indices, 'Price'].astype(str).replace(to_replace=r'^(\$\d+\.\d+)\$(\d+\.\d+)$', value=r'\2', regex=True).astype(float)

#dog_kibble_df.loc[odd_price_indices, ['Price($)']]

In [8]:
# Separate Top 5 ingridients 
dog_kibble_df['TOP 5 Ingredients'] = dog_kibble_df['Ingredients'].apply(lambda x: ', '.join(str(x).split(', ')[:5]))

# Extract only the numerical part from 'Bag Size'
dog_kibble_df['Bag Size(lbs)'] = dog_kibble_df['Bag Size'].str.extract(r'(\d+)', expand=False)

# Convert 'Bag Size(lbs)' to numeric values
dog_kibble_df['Bag Size(lbs)'] = pd.to_numeric(dog_kibble_df['Bag Size(lbs)'], errors='coerce')

# Create 'Bag Size(kg)' column by converting 'Bag Size(lbs)' to kilograms (1 lb = 0.453592 kg)
dog_kibble_df['Bag Size(kg)'] = round(dog_kibble_df['Bag Size(lbs)'] * 0.453592, 2)

dog_kibble_df.head()

Unnamed: 0,Product Name,Price,Bag Size,Description,Ingredients,Guaranteed Analysis,Scraping Date,Product Link,Brand,Kibble Name,Sub Brand,Price($),TOP 5 Ingredients,Bag Size(lbs),Bag Size(kg)
0,performatrin Ultra Limited Ingredient Kangaroo...,$112.99,24 lb,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,Ultra,112.99,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils",24,10.89
1,performatrin Ultra Wholesome Grains Lamb & Bro...,$86.99,24 lb,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,Ultra,86.99,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice",24,10.89
2,performatrin Prime Chicken & Rice Formula Larg...,$99.99,37 lb,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-pr...,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,Prime,99.99,"Chicken, Chicken Meal (source of Glucosamine a...",37,16.78
3,performatrin Ultra Limited Ingredient Sweet Po...,$115.99,24 lb,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,Ultra,115.99,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",24,10.89
4,ACANA Classics Red Meat Recipe Dog Food,$98.99,31.9 lb,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,2024-01-21,https://www.petvalu.ca/product/acana-classic-r...,ACANA,Classics Red Meat Recipe Dog Food,,98.99,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",31,14.06


In [9]:
# Price per kg and price per lbs 
dog_kibble_df['Price per 1kg'] = round(dog_kibble_df['Price($)'] / dog_kibble_df['Bag Size(kg)'],2)

dog_kibble_df['Price per 1lbs'] = round(dog_kibble_df['Price($)'] / dog_kibble_df['Bag Size(lbs)'],2)

In [10]:
# Rename and Drop columns
dog_kibble_df.drop(['Product Name', 'Sub Brand', 'Price', 'Bag Size'], axis=1, inplace=True)

dog_kibble_df.rename(columns={'Ingredients': 'Full Ingredient List'}, inplace=True)


In [11]:
# Reorder columns
new_order = ['Brand', 'Kibble Name', 'Bag Size(lbs)', 'Bag Size(kg)', 'Price($)', 'Price per 1lbs', 'Price per 1kg', 'TOP 5 Ingredients', 'Description',
             'Full Ingredient List', 'Guaranteed Analysis', 'Product Link', 'Scraping Date']

dog_kibble_df = dog_kibble_df[new_order]

In [12]:
dog_kibble_df.head()

Unnamed: 0,Brand,Kibble Name,Bag Size(lbs),Bag Size(kg),Price($),Price per 1lbs,Price per 1kg,TOP 5 Ingredients,Description,Full Ingredient List,Guaranteed Analysis,Product Link,Scraping Date
0,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,24,10.89,112.99,4.71,10.38,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils",performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
1,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,24,10.89,86.99,3.62,7.99,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice",performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
2,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,37,16.78,99.99,2.7,5.96,"Chicken, Chicken Meal (source of Glucosamine a...",performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-pr...,2024-01-21
3,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,24,10.89,115.99,4.83,10.65,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
4,ACANA,Classics Red Meat Recipe Dog Food,31,14.06,98.99,3.19,7.04,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,https://www.petvalu.ca/product/acana-classic-r...,2024-01-21


In [13]:
# Remove duplicate rows
dog_kibble_df.drop_duplicates(inplace=True)

# Check for missing values in other columns
missing_values = dog_kibble_df.isnull().sum()
missing_values

Brand                   0
Kibble Name             0
Bag Size(lbs)           0
Bag Size(kg)            0
Price($)                0
Price per 1lbs          0
Price per 1kg           0
TOP 5 Ingredients       0
Description             0
Full Ingredient List    0
Guaranteed Analysis     0
Product Link            0
Scraping Date           0
dtype: int64

In [14]:
# Extract the 'Protein Content','Fat Content', and 'Fiber Content'
dog_kibble_df['Protein Content min(%)'] = dog_kibble_df['Guaranteed Analysis'].str.extract(r'(?:protein|Protein)\D*(\d+\.*\d*)', flags=re.IGNORECASE)
dog_kibble_df['Fat Content min(%)'] = dog_kibble_df['Guaranteed Analysis'].str.extract(r'(?:fat|Fat)\D*(\d+\.*\d*)', flags=re.IGNORECASE)
dog_kibble_df['Fiber Content min(%)'] = dog_kibble_df['Guaranteed Analysis'].str.extract(r'(?:fiber|Fibre)\D*(\d+\.*\d*)', flags=re.IGNORECASE)
dog_kibble_df['Moisture Content min(%)'] = dog_kibble_df['Guaranteed Analysis'].str.extract(r'(?:moisture|Moisture)\D*(\d+\.*\d*)', flags=re.IGNORECASE)

# Convert the 'Protein Content' column to numeric
dog_kibble_df['Protein Content min(%)'] = pd.to_numeric(dog_kibble_df['Protein Content min(%)'], errors='coerce')
dog_kibble_df['Fat Content min(%)'] = pd.to_numeric(dog_kibble_df['Fat Content min(%)'], errors='coerce')
dog_kibble_df['Fiber Content min(%)'] = pd.to_numeric(dog_kibble_df['Fiber Content min(%)'], errors='coerce')
dog_kibble_df['Moisture Content min(%)'] = pd.to_numeric(dog_kibble_df['Moisture Content min(%)'], errors='coerce')

# Print the updated DataFrame
dog_kibble_df.head()

Unnamed: 0,Brand,Kibble Name,Bag Size(lbs),Bag Size(kg),Price($),Price per 1lbs,Price per 1kg,TOP 5 Ingredients,Description,Full Ingredient List,Guaranteed Analysis,Product Link,Scraping Date,Protein Content min(%),Fat Content min(%),Fiber Content min(%),Moisture Content min(%)
0,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,24,10.89,112.99,4.71,10.38,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils",performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21,24.0,14.0,5.5,10.0
1,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,24,10.89,86.99,3.62,7.99,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice",performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21,22.0,12.0,4.5,10.0
2,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,37,16.78,99.99,2.7,5.96,"Chicken, Chicken Meal (source of Glucosamine a...",performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-pr...,2024-01-21,24.0,12.0,4.0,10.0
3,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,24,10.89,115.99,4.83,10.65,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21,21.0,11.0,4.5,10.0
4,ACANA,Classics Red Meat Recipe Dog Food,31,14.06,98.99,3.19,7.04,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,https://www.petvalu.ca/product/acana-classic-r...,2024-01-21,27.0,16.0,5.0,12.0


In [15]:
missing_values = dog_kibble_df.isnull().sum()
missing_values

Brand                      0
Kibble Name                0
Bag Size(lbs)              0
Bag Size(kg)               0
Price($)                   0
Price per 1lbs             0
Price per 1kg              0
TOP 5 Ingredients          0
Description                0
Full Ingredient List       0
Guaranteed Analysis        0
Product Link               0
Scraping Date              0
Protein Content min(%)     2
Fat Content min(%)         0
Fiber Content min(%)       2
Moisture Content min(%)    3
dtype: int64

In [16]:
# Drop rows with missing values
dog_kibble_df.dropna(inplace=True)

# Reset index after dropping rows
dog_kibble_df.reset_index(drop=True, inplace=True)

In [17]:
missing_values = dog_kibble_df.isnull().sum()
missing_values

Brand                      0
Kibble Name                0
Bag Size(lbs)              0
Bag Size(kg)               0
Price($)                   0
Price per 1lbs             0
Price per 1kg              0
TOP 5 Ingredients          0
Description                0
Full Ingredient List       0
Guaranteed Analysis        0
Product Link               0
Scraping Date              0
Protein Content min(%)     0
Fat Content min(%)         0
Fiber Content min(%)       0
Moisture Content min(%)    0
dtype: int64

In [18]:
dog_kibble_df.dtypes

Brand                       object
Kibble Name                 object
Bag Size(lbs)                int64
Bag Size(kg)               float64
Price($)                   float64
Price per 1lbs             float64
Price per 1kg              float64
TOP 5 Ingredients           object
Description                 object
Full Ingredient List        object
Guaranteed Analysis         object
Product Link                object
Scraping Date               object
Protein Content min(%)     float64
Fat Content min(%)         float64
Fiber Content min(%)       float64
Moisture Content min(%)    float64
dtype: object

In [19]:
dog_kibble_df['Scraping Date'] = pd.to_datetime(dog_kibble_df['Scraping Date'])

In [20]:
# Specify the file path to save the cleaned DataFrame
output_file_path = '../Resources/cleaned_dog_food_data.csv'

dog_kibble_df.to_csv(output_file_path, index=False)

print(f"Data saved to: {output_file_path}")

Data saved to: ../Resources/cleaned_dog_food_data.csv
