In [37]:
import pandas as pd
import numpy as np
import re

In [38]:
# Indicate file path and load it as df 
file_path = 'Resources/scraped_dog_food_20240121.csv'
dog_kibble_df = pd.read_csv(file_path)

dog_kibble_df.head()

Unnamed: 0,Product Name,Price,Bag Size,Description,Ingredients,Guaranteed Analysis,Scraping Date,Product Link
0,performatrin Ultra Limited Ingredient Kangaroo...,$112.99,24 lb,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
1,performatrin Ultra Wholesome Grains Lamb & Bro...,$86.99,24 lb,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
2,performatrin Prime Chicken & Rice Formula Larg...,$99.99,37 lb,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-pr...
3,performatrin Ultra Limited Ingredient Sweet Po...,$115.99,24 lb,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...
4,ACANA Classics Red Meat Recipe Dog Food,$98.99,31.9 lb,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,2024-01-21,https://www.petvalu.ca/product/acana-classic-r...


In [39]:
# Get the number of rows & column names
num_rows = dog_kibble_df.shape[0]
column_names = dog_kibble_df.columns.tolist()

print(f"Number of rows: {num_rows}")
print(f"Column names: {column_names}")

Number of rows: 346
Column names: ['Product Name', 'Price', 'Bag Size', 'Description', 'Ingredients', 'Guaranteed Analysis', 'Scraping Date', 'Product Link']


In [40]:
# Drop empty rows
dog_kibble_df.dropna(inplace=True)
num_rows_cleaned = dog_kibble_df.shape[0]

print(f"Number of rows after removing empty rows: {num_rows_cleaned}")

Number of rows after removing empty rows: 346


In [41]:
# Separate the Brand Name from the Kibble Name
brand_names = ["ACANA", "Barker's Complete", "Blue Buffalo", "Diamond Naturals", "Diamond Care", 
               "Gather", "Go! Solutions", "Hill's Science Diet", "Iams", "Lifetime", 
               "Merrick", "Natural Balance", "Now Fresh", "Nutro", "Open Farm", "ORIJEN", 
               "performatrin", "performatrin NATURALS", "performatrin Prime", 
               "performatrin Ultra", "Purina", "Purina Pro Plan", "Royal Canin", "Stella & Chewy's", 
               "Summit", "Taste of the Wild", "Wilder Harrier", "Zignature"]

# Create a pattern to match brand names
brand_pattern = '|'.join(brand_names)

# Extract brand and kibble name using regex
dog_kibble_df[['Brand', 'Kibble Name']] = dog_kibble_df['Product Name'].str.extract(f'({brand_pattern})\s?(.*)', expand=True)

### SEPARATES THE BRANDS EXCEPT FOR WHEN THE BRAND NAME REPEATS LIKE "performatrin", "performatrin NATURALS", "Purina", "Purina Pro Plan"

In [42]:
# Separate sub brands
sub_brand_names = ["NATURALS", "Prime", "Ultra", "Pro Plan", "Naturals", "Care"]

# Function to check if Kibble Name contains any sub-brand name
def extract_sub_brand(kibble_name):
    return next((sub_brand for sub_brand in sub_brand_names if isinstance(kibble_name, str) and sub_brand in kibble_name), None)

# Create 'Sub Brand' column
dog_kibble_df['Sub Brand'] = dog_kibble_df['Kibble Name'].apply(extract_sub_brand)

# Update 'Brand' column by joining with 'Sub Brand' if applicable
dog_kibble_df['Brand'] = np.where(~dog_kibble_df['Sub Brand'].isna(), dog_kibble_df['Brand'] + ' ' + dog_kibble_df['Sub Brand'], dog_kibble_df['Brand'])

# Remove sub-brand from 'Kibble Name'
def remove_sub_brand(kibble_name, sub_brand):
    if isinstance(kibble_name, str) and isinstance(sub_brand, str):
        return kibble_name.replace(sub_brand, '').strip()
    return kibble_name

dog_kibble_df['Kibble Name'] = dog_kibble_df.apply(lambda row: remove_sub_brand(row['Kibble Name'], row['Sub Brand']), axis=1)

# Drop unnecessary columns
dog_kibble_df.drop(['Product Name', 'Sub Brand'], axis=1, inplace=True)

In [43]:
# Extract the percentage of Crude Protein
dog_kibble_df['Crude Protein'] = dog_kibble_df['Guaranteed Analysis'].str.extract(r'Crude Protein\s*(.*?)%', expand=False, flags=re.IGNORECASE)

# Fill NaN values with an empty string
dog_kibble_df['Crude Protein'].fillna('', inplace=True)

# Separate Top 5 ingridients 
dog_kibble_df['TOP 5 Ingredients'] = dog_kibble_df['Ingredients'].apply(lambda x: ', '.join(str(x).split(', ')[:5]))

dog_kibble_df.head()

Unnamed: 0,Price,Bag Size,Description,Ingredients,Guaranteed Analysis,Scraping Date,Product Link,Brand,Kibble Name,Crude Protein,TOP 5 Ingredients
0,$112.99,24 lb,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,(min.) 24.0,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils"
1,$86.99,24 lb,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,(min.) 22.0,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice"
2,$99.99,37 lb,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,2024-01-21,https://www.petvalu.ca/product/performatrin-pr...,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,(min.) 24.0,"Chicken, Chicken Meal (source of Glucosamine a..."
3,$115.99,24 lb,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,2024-01-21,https://www.petvalu.ca/product/performatrin-ul...,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,(min.) 21.0,"Sweet Potato, Venison, Peas, Pea Starch, Potat..."
4,$98.99,31.9 lb,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,2024-01-21,https://www.petvalu.ca/product/acana-classic-r...,ACANA,Classics Red Meat Recipe Dog Food,27,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle..."


In [44]:
#Rename and Reorder columns 
dog_kibble_df.rename(columns={'Ingredients': 'Full Ingredient List'}, inplace=True)

new_order = ['Brand', 'Kibble Name', 'Bag Size', 'Price', 'TOP 5 Ingredients', 'Crude Protein', 'Description',
             'Full Ingredient List', 'Guaranteed Analysis', 'Product Link', 'Scraping Date']

dog_kibble_df = dog_kibble_df[new_order]


In [45]:
dog_kibble_df.head(100)

Unnamed: 0,Brand,Kibble Name,Bag Size,Price,TOP 5 Ingredients,Crude Protein,Description,Full Ingredient List,Guaranteed Analysis,Product Link,Scraping Date
0,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,24 lb,$112.99,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils",(min.) 24.0,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
1,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,24 lb,$86.99,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice",(min.) 22.0,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
2,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,37 lb,$99.99,"Chicken, Chicken Meal (source of Glucosamine a...",(min.) 24.0,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-pr...,2024-01-21
3,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,24 lb,$115.99,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",(min.) 21.0,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
4,ACANA,Classics Red Meat Recipe Dog Food,31.9 lb,$98.99,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",27,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,https://www.petvalu.ca/product/acana-classic-r...,2024-01-21
...,...,...,...,...,...,...,...,...,...,...,...
95,Taste of the Wild,High Prairie Canine Formula Dog Food,28 lb,$97.99,"Buffalo, lamb meal, chicken meal, sweet potato...",32.0,A grain-free formula with sweet potatoes and p...,"Buffalo, lamb meal, chicken meal, sweet potato...",Crude Protein 32.0% minimum\nCrude Fat 18.0% m...,https://www.petvalu.ca/product/taste-of-the-wi...,2024-01-21
96,Merrick,Real Texas Beef + Sweet Potato Grain-Free Reci...,22 lb,$128.99,"Deboned Beef, Pork Meal, Salmon Meal, Sweet Po...",(min) 32,Merrick Real Texas Beef + Sweet Potato Grain-F...,"Deboned Beef, Pork Meal, Salmon Meal, Sweet Po...",Crude Protein (min) 32% Crude Fat (min) 15% Cr...,https://www.petvalu.ca/product/merrick-grain-f...,2024-01-21
97,Taste of the Wild,Sierra Mountain Canine Formula Dog Food,28 lb,$97.99,"Lamb, lamb meal, sweet potatoes, potatoes, peas",25.0,"A lamb protein, grain-free formula for all lif...","Lamb, lamb meal, sweet potatoes, potatoes, pea...",Crude Protein 25.0% minimum\nCrude Fat 15.0% m...,https://www.petvalu.ca/product/taste-of-the-wi...,2024-01-21
98,Stella & Chewy's,Raw Coated Beef Recipe Dog Food,22 lb,$99.99,"Beef, lamb meal, chickpeas, peas, salmon meal",(min) 32.0,Stella & Chewy's Baked Raw Coated Kibble is th...,"Beef, lamb meal, chickpeas, peas, salmon meal,...",Crude Protein (min) 32.0%\nCrude Fat (min) 15....,https://www.petvalu.ca/product/stella-chewys-r...,2024-01-21


In [46]:
# Remove duplicate rows
dog_kibble_df.drop_duplicates(inplace=True)

# Check for missing values in other columns
missing_values = dog_kibble_df.isnull().sum()
missing_values

Brand                   0
Kibble Name             0
Bag Size                0
Price                   0
TOP 5 Ingredients       0
Crude Protein           0
Description             0
Full Ingredient List    0
Guaranteed Analysis     0
Product Link            0
Scraping Date           0
dtype: int64

In [47]:
dog_kibble_df

Unnamed: 0,Brand,Kibble Name,Bag Size,Price,TOP 5 Ingredients,Crude Protein,Description,Full Ingredient List,Guaranteed Analysis,Product Link,Scraping Date
0,performatrin Ultra,Limited Ingredient Kangaroo Recipe Adult Dog Food,24 lb,$112.99,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils",(min.) 24.0,performatrin Ultra Limited Ingredient Kangaroo...,"Kangaroo, Peas, Chickpeas, Dried Peas, Lentils...",Crude Protein (min.) 24.0% Crude Fat (min.) 14...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
1,performatrin Ultra,Wholesome Grains Lamb & Brown Rice Recipe Adul...,24 lb,$86.99,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice",(min.) 22.0,performatrin Ultra Wholesome Grains Lamb & Bro...,"Lamb, Lamb Meal, Brown Rice, Oatmeal, Rice, Pe...",Crude Protein (min.) 22.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
2,performatrin Prime,Chicken & Rice Formula Large Breed Adult Dog Food,37 lb,$99.99,"Chicken, Chicken Meal (source of Glucosamine a...",(min.) 24.0,performatrin Prime Chicken & Rice Formula Larg...,"Chicken, Chicken Meal (source of Glucosamine a...",Crude Protein (min.) 24.0% Crude Fat (min.) 12...,https://www.petvalu.ca/product/performatrin-pr...,2024-01-21
3,performatrin Ultra,Limited Ingredient Sweet Potato & Venison Reci...,24 lb,$115.99,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",(min.) 21.0,performatrin Ultra Limited Ingredient Sweet Po...,"Sweet Potato, Venison, Peas, Pea Starch, Potat...",Crude Protein (min.) 21.0% Crude Fat (min.) 11...,https://www.petvalu.ca/product/performatrin-ul...,2024-01-21
4,ACANA,Classics Red Meat Recipe Dog Food,31.9 lb,$98.99,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",27,ACANA Classics Red Meat Recipe Dog Food is a m...,"Raw Beef (16%), Lamb Meal (15%), Pearled Barle...",Crude Protein 27% Fat Content 16% Crude Ash 8%...,https://www.petvalu.ca/product/acana-classic-r...,2024-01-21
...,...,...,...,...,...,...,...,...,...,...,...
341,Blue Buffalo,Wilderness Chicken With Grain Small Breed Adul...,4.5 lb,$33.99,"Deboned Chicken, Chicken Meal, Dried Chicken, ...",36.0,Blue Buffalo Wilderness Chicken With Grain Sma...,"Deboned Chicken, Chicken Meal, Dried Chicken, ...",Crude Protein 36.0% min Crude Fat 16.0% min Cr...,https://www.petvalu.ca/product/blue-buffalo-wi...,2024-01-21
342,Now Fresh,"Grain-Free Turkey, Salmon & Duck Recipe Large ...",25 lb,$125.99,"De-boned turkey, potatoes, peas, whole dried e...",(min) 25,"Now Fresh Grain-Free Turkey, Salmon & Duck Rec...","De-boned turkey, potatoes, peas, whole dried e...",Crude Protein (min) 25% Crude Fat (min) 11% Cr...,https://www.petvalu.ca/product/now-fresh-grain...,2024-01-21
343,Royal Canin,Breed Health Nutrition Pug Puppy Dog Food,2.5 lb,$36.99,"Chicken by-product meal, brewers rice, corn, c...",(min.) 27.0,Royal Canin Breed Health Nutrition Pug Puppy D...,"Chicken by-product meal, brewers rice, corn, c...",Crude Protein (min.) 27.0% Crude Fat (min.) 16...,https://www.petvalu.ca/product/royal-canin-bre...,2024-01-21
344,Lifetime,Grain-Free Free Run Chicken Meal,25 lb,$63.99,"Chicken Meal, Peas, Northern White Beans, Chic...",(min) 29.0,"LIFETIME Grain-Free, Free Run Chicken Recipe A...","Chicken Meal, Peas, Northern White Beans, Chic...",Crude Protein(min) 29.0 % Crude Fat (min) 15.0...,https://www.petvalu.ca/product/lifetime-grain-...,2024-01-21


In [48]:
# Specify the file path to save the cleaned DataFrame
output_file_path = 'Resources/cleaned_dog_food_data.csv'

dog_kibble_df.to_csv(output_file_path, index=False)

print(f"Data saved to: {output_file_path}")

Data saved to: cleaned_dog_food_data.csv
