#Final Project - Data Gathering and Warehousing - DSSA-5102#
Instructor: Melissa Laurino
Spring 2025

##Data Cleaning##
Let's load our data, clean it up and prepare it for use in our final project.

In [2]:
# Load necessary packages:
import pandas as pd
import numpy as np
import os

In [3]:
# load AKC csv file
akc_dogs_df = pd.read_csv('datasets/akc-data-latest.csv')
# load Pet Owners file
pet_owners_df = pd.read_csv('datasets/pet-owners.csv')

# Breed Dataset

Let's clean up the breed dataset

In [4]:
# rename the first column to breed
akc_dogs_df.columns.values[0] = "breed"
akc_dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   breed                        277 non-null    object 
 1   description                  277 non-null    object 
 2   temperament                  276 non-null    object 
 3   popularity                   198 non-null    object 
 4   min_height                   277 non-null    float64
 5   max_height                   277 non-null    float64
 6   min_weight                   275 non-null    float64
 7   max_weight                   275 non-null    float64
 8   min_expectancy               274 non-null    float64
 9   max_expectancy               274 non-null    float64
 10  group                        277 non-null    object 
 11  grooming_frequency_value     270 non-null    float64
 12  grooming_frequency_category  270 non-null    object 
 13  shedding_value      

In [5]:
# Drop the columns we will not be using
akc_dogs_df = akc_dogs_df.drop(columns=['description', 'group', 'grooming_frequency_value', 'grooming_frequency_category', 'shedding_value', 'shedding_category'])

In [6]:
# Preview our dataframe after the column drops
akc_dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   breed                  277 non-null    object 
 1   temperament            276 non-null    object 
 2   popularity             198 non-null    object 
 3   min_height             277 non-null    float64
 4   max_height             277 non-null    float64
 5   min_weight             275 non-null    float64
 6   max_weight             275 non-null    float64
 7   min_expectancy         274 non-null    float64
 8   max_expectancy         274 non-null    float64
 9   energy_level_value     271 non-null    float64
 10  energy_level_category  271 non-null    object 
 11  trainability_value     253 non-null    float64
 12  trainability_category  253 non-null    object 
 13  demeanor_value         252 non-null    float64
 14  demeanor_category      252 non-null    object 
dtypes: flo

In [7]:
# let's look for any rows with missing values
rows_with_missing_values = akc_dogs_df[akc_dogs_df.isna().any(axis=1)]
print(rows_with_missing_values)

                         breed                           temperament  \
5             American Bulldog                 Loyal, Self-Confident   
9    American Hairless Terrier             Energetic, Alert, Curious   
10      American Leopard Hound      Sociable, Energetic, Intelligent   
14      Appenzeller Sennenhund              Agile, Versatile, Lively   
16           Australian Kelpie             Loyal, Alert, Intelligent   
..                         ...                                   ...   
260        Transylvanian Hound  Courageous, Good-Natured, Determined   
261  Treeing Tennessee Brindle          Friendly, Alert, Intelligent   
268                 Wetterhoun      Loyal, Good-Natured, Intelligent   
273             Working Kelpie             Alert, Eager, Intelligent   
275             Yakutian Laika     Affectionate, Intelligent, Active   

    popularity  min_height  max_height  min_weight  max_weight  \
5          NaN       50.80       63.50   27.215542   45.359237   
9  

In [8]:
# Replace the NaN values in object type columns with None, that seems to work better with MySQL
# set the object type columns
cols_to_replace = ['energy_level_category', 'trainability_category', 'demeanor_category']

# Replace NaN with None in the specified columns
akc_dogs_df[cols_to_replace] = akc_dogs_df[cols_to_replace].where(pd.notna(akc_dogs_df[cols_to_replace]), None)

In [9]:
akc_dogs_df['popularity'].unique()

array(['148', '113', '60', '47', '58', nan, '175', '122', '186', '136',
       '85', '166', '90', '55', '17', '140', 'of', '87', '39', '6', '127',
       '124', '141', '43', '125', '106', '187', '144', '22', '46', '138',
       '118', '49', '130', '121', '35', '88', '103', '21', '84', '11',
       '100', '132', '26', '98', '62', '5', '51', '69', '179', '32', '68',
       '18', '185', '45', '33', '79', '64', '190', '75', '183', '143',
       '30', '38', '81', '162', '12', '56', '176', '67', '52', '188',
       '94', '27', '135', '157', '149', '161', '184', '91', '4', '134',
       '2', '9', '63', '78', '174', '3', '115', '177', '16', '66', '74',
       '145', '189', '24', '152', '155', '146', '77', '116', '159', '76',
       '73', '104', '95', '129', '173', '163', '1', '99', '147', '93',
       '71', '168', '37', '133', '29', '34', '110', '70', '19', '102',
       '150', '40', '126', '165', '97', '191', '108', '83', '72', '182',
       '54', '117', '92', '13', '156', '172', '171', '114'

In [10]:
# Convert popularity column to numeric, then replace NaN in the popularity column with 0
akc_dogs_df['popularity'] = pd.to_numeric(akc_dogs_df['popularity'], errors='coerce')
akc_dogs_df['popularity'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  akc_dogs_df['popularity'].fillna(0, inplace=True)


In [11]:
akc_dogs_df['popularity'].unique()
#akc_dogs_df.info()

array([148., 113.,  60.,  47.,  58.,   0., 175., 122., 186., 136.,  85.,
       166.,  90.,  55.,  17., 140.,  87.,  39.,   6., 127., 124., 141.,
        43., 125., 106., 187., 144.,  22.,  46., 138., 118.,  49., 130.,
       121.,  35.,  88., 103.,  21.,  84.,  11., 100., 132.,  26.,  98.,
        62.,   5.,  51.,  69., 179.,  32.,  68.,  18., 185.,  45.,  33.,
        79.,  64., 190.,  75., 183., 143.,  30.,  38.,  81., 162.,  12.,
        56., 176.,  67.,  52., 188.,  94.,  27., 135., 157., 149., 161.,
       184.,  91.,   4., 134.,   2.,   9.,  63.,  78., 174.,   3., 115.,
       177.,  16.,  66.,  74., 145., 189.,  24., 152., 155., 146.,  77.,
       116., 159.,  76.,  73., 104.,  95., 129., 173., 163.,   1.,  99.,
       147.,  93.,  71., 168.,  37., 133.,  29.,  34., 110.,  70.,  19.,
       102., 150.,  40., 126., 165.,  97., 191., 108.,  83.,  72., 182.,
        54., 117.,  92.,  13., 156., 172., 171., 114., 170.,  23.,   7.,
       154.,  50.,  28., 160., 151., 181.,  86., 14

In [12]:
# let's see the max values in max_height, max_weight, and max_expectancy columns to get an idea of the data range in them
maxAkcValues = akc_dogs_df[['max_height', 'max_weight', 'max_expectancy']].max()
maxAkcValues

max_height         88.900000
max_weight        108.862169
max_expectancy     19.000000
dtype: float64

In [13]:
# let's do a final cleanup in case we missed any empty values
akc_dogs_df = akc_dogs_df.fillna({
    'breed': 'Unknown',
    'temperament': 'Unknown',
    'popularity': 0,
    'min_height': 0,
    'max_height': 0,
    'min_weight': 0,
    'max_weight': 0,
    'min_expectancy': 0,
    'max_expectancy': 0,
    'energy_level_value': 0,
    'energy_level_category': 'Unknown',
    'trainability_value': 0,
    'trainability_category': 'Unknown',
    'demeanor_value': 0,
    'demeanor_category': 'Unknown'
})

In [57]:
# Export the akc_dogs_df dataset to a .csv file
akc_dogs_df.to_csv('datasets/dog_breed_clean.csv', index=False, sep=',', encoding='utf-8')

# Pet Dataset

Let's clean up the pet dataset

In [14]:
# preview of the pet owners data
pet_owners_df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A882831,*Hamilton,07/01/2023 06:12:00 PM,Jul 2023,03/25/2023,Adoption,,Cat,Neutered Male,3 months,Domestic Shorthair Mix,Black/White
1,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
2,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
3,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
4,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff


In [15]:
print(pet_owners_df)

       Animal ID       Name                DateTime MonthYear Date of Birth  \
0        A882831  *Hamilton  07/01/2023 06:12:00 PM  Jul 2023    03/25/2023   
1        A794011      Chunk  05/08/2019 06:20:00 PM  May 2019    05/02/2017   
2        A776359      Gizmo  07/18/2018 04:02:00 PM  Jul 2018    07/12/2017   
3        A821648        NaN  08/16/2020 11:38:00 AM  Aug 2020    08/16/2019   
4        A720371      Moose  02/13/2016 05:59:00 PM  Feb 2016    10/08/2015   
...          ...        ...                     ...       ...           ...   
172595   A926855   *Eleanor  03/24/2025 04:30:00 PM  Mar 2025    11/18/2024   
172596   A926900     *Ponzu  03/24/2025 05:34:00 PM  Mar 2025    03/18/2023   
172597   A925467    *Oliver  03/24/2025 05:24:00 PM  Mar 2025    02/24/2023   
172598   A926030    Kashmir  03/24/2025 05:49:00 PM  Mar 2025    03/04/2024   
172599   A926980        NaN  03/24/2025 05:44:00 PM  Mar 2025    01/13/2025   

       Outcome Type Outcome Subtype Animal Type Sex

In [16]:
# Number of AKC dog breeds
len(akc_dogs_df['breed'].unique())

277

In [17]:
# Filter pet_owners_df to be only Animal Type of Dog
dog_pet_owners_df = pet_owners_df[pet_owners_df['Animal Type'] == 'Dog']

In [18]:
# Get an idea of how what kind of unique breed values are in pet
dog_pet_owners_df['Breed'].unique()

array(['Chihuahua Shorthair Mix', 'Anatol Shepherd/Labrador Retriever',
       'American Foxhound/Labrador Retriever', ..., 'Pit Bull/Bloodhound',
       'Shih Tzu/Cairn Terrier', 'French Bulldog/Pomeranian'],
      dtype=object)

In [19]:
# Assign all the dog breeds that are of mix to pet_contains_slash_df
pet_contains_slash_df = dog_pet_owners_df[dog_pet_owners_df['Breed'].str.contains('Mix', na=False)]
pet_contains_slash_df

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
2,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
6,A659412,Princess,10/05/2020 02:37:00 PM,Oct 2020,03/24/2013,Adoption,,Dog,Spayed Female,7 years,Chihuahua Shorthair Mix,Brown
17,A843327,*Mary,10/08/2021 01:25:00 PM,Oct 2021,09/29/2019,Transfer,Out State,Dog,Intact Female,2 years,Chihuahua Shorthair Mix,Black/White
20,A595061,Gypsy,08/18/2022 06:07:00 PM,Aug 2022,10/01/2010,Return to Owner,,Dog,Spayed Female,11 years,Labrador Retriever Mix,Red/White
27,A856853,Jameson,05/20/2022 03:32:00 PM,May 2022,11/09/2021,Adoption,,Dog,Neutered Male,6 months,Black Mouth Cur Mix,Brown/Black
...,...,...,...,...,...,...,...,...,...,...,...,...
172565,A737625,*Lt Winters,03/23/2025 06:06:00 PM,Mar 2025,05/01/2015,Adoption,,Dog,Neutered Male,9 years,Pit Bull Mix,White
172574,A856897,Junior,05/12/2022 04:01:00 PM,May 2022,03/09/2022,Adoption,,Dog,Neutered Male,2 months,German Shepherd Mix,Tricolor
172586,A926712,*Rocket Man,03/24/2025 04:19:00 PM,Mar 2025,09/15/2024,Adoption,,Dog,Neutered Male,6 months,Carolina Dog Mix,Red/White
172598,A926030,Kashmir,03/24/2025 05:49:00 PM,Mar 2025,03/04/2024,Adoption,,Dog,Spayed Female,1 year,Labrador Retriever Mix,Black/White


59571 rows cotain " Mix" with the dog breed. We're going to remove " Mix" if the aditional breed wasn't provided with the dataset.

In [20]:
# Strip " Mix" from the Breed column
dog_pet_owners_df.loc[:, 'Breed'] = dog_pet_owners_df['Breed'].str.rstrip(' Mix')

In [21]:
# See how many breed matches are between the akc and pets datasets
breed_matches_df = dog_pet_owners_df['Breed'].isin(akc_dogs_df['breed']).sum()
print(f"Toal Pet Owners: {len(dog_pet_owners_df)}")
print(f"Total Pet Owners with AKC Breed Matches: {breed_matches_df}")

Toal Pet Owners: 93978
Total Pet Owners with AKC Breed Matches: 38879


In [22]:
dog_pet_owners_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93978 entries, 2 to 172599
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Animal ID         93978 non-null  object
 1   Name              79208 non-null  object
 2   DateTime          93978 non-null  object
 3   MonthYear         93978 non-null  object
 4   Date of Birth     93978 non-null  object
 5   Outcome Type      93951 non-null  object
 6   Outcome Subtype   31146 non-null  object
 7   Animal Type       93978 non-null  object
 8   Sex upon Outcome  93976 non-null  object
 9   Age upon Outcome  93974 non-null  object
 10  Breed             93978 non-null  object
 11  Color             93978 non-null  object
dtypes: object(12)
memory usage: 9.3+ MB


In [23]:
# See the unique values of Outcome Type column
dog_pet_owners_df['Outcome Type'].unique()

array(['Adoption', 'Return to Owner', 'Transfer', 'Rto-Adopt',
       'Euthanasia', 'Died', 'Missing', nan, 'Disposal', 'Stolen', 'Lost',
       'Relocate'], dtype=object)

In [24]:
# Create pet_trim_df dataframe with just the columns we want from dog_pet_owners_df
pet_trim_df = dog_pet_owners_df.loc[:, ['Name', 'Date of Birth', 'Outcome Type', 'Outcome Subtype', 'DateTime', 'Sex upon Outcome', 'Color', 'Age upon Outcome', 'Breed']]

In [25]:
# Rename the columns to be all lowercase with no spaces
pet_trim_df.rename(columns={'Name': 'name', 'Date of Birth': 'dob', 'Outcome Type': 'outcome_type', 'Outcome Subtype': 'outcome_subtype', 'DateTime': 'outcome_datetime', 'Sex upon Outcome': 'sex', 'Color': 'color', 'Age upon Outcome': 'age', 'Breed': 'breed'}, inplace=True)
pet_trim_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93978 entries, 2 to 172599
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              79208 non-null  object
 1   dob               93978 non-null  object
 2   outcome_type      93951 non-null  object
 3   outcome_subtype   31146 non-null  object
 4   outcome_datetime  93978 non-null  object
 5   sex               93976 non-null  object
 6   color             93978 non-null  object
 7   age               93974 non-null  object
 8   breed             93978 non-null  object
dtypes: object(9)
memory usage: 7.2+ MB


In [26]:
# let's look for any rows with missing values
pet_rows_with_missing_values = pet_trim_df[pet_trim_df.isna().any(axis=1)]
print(pet_rows_with_missing_values)

               name         dob     outcome_type outcome_subtype  \
2             Gizmo  07/12/2017         Adoption             NaN   
4             Moose  10/08/2015         Adoption             NaN   
6          Princess  03/24/2013         Adoption             NaN   
12            Tulip  08/06/2019         Adoption             NaN   
16            Fiona  06/01/2018  Return to Owner             NaN   
...             ...         ...              ...             ...   
172574       Junior  03/09/2022         Adoption             NaN   
172586  *Rocket Man  09/15/2024         Adoption             NaN   
172591   *Charlotte  12/02/2024         Adoption             NaN   
172598      Kashmir  03/04/2024         Adoption             NaN   
172599          NaN  01/13/2025         Adoption             NaN   

              outcome_datetime            sex        color       age  \
2       07/18/2018 04:02:00 PM  Neutered Male  White/Brown    1 year   
4       02/13/2016 05:59:00 PM  Neutere

In [27]:
# Let's replace the NaN values for the name with Unknown
pet_trim_df['name'] = pet_trim_df['name'].replace(np.nan, 'Unknown')

In [28]:
# Let's replace the NaN values for the outcome_type with None
pet_trim_df['outcome_type'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pet_trim_df['outcome_type'].fillna('None', inplace=True)


In [29]:
# Drop any remaining rows with NaN values
pet_trim_df.dropna(inplace=True)

In [30]:
# Convert dob to MySQL friendly date format
pet_trim_df['dob'] = pd.to_datetime(pet_trim_df['dob'])
pet_trim_df['dob'] = pet_trim_df['dob'].dt.strftime('%Y-%m-%d')

In [31]:
# Convert outcome_datetime to MySQL friendly date/time format
pet_trim_df['outcome_datetime'] = pd.to_datetime(pet_trim_df['outcome_datetime'])
pet_trim_df['outcome_datetime'] = pet_trim_df['outcome_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [32]:
#Let's keep only rows where pet breed matches the akc_dogs breed
akc_breeds = akc_dogs_df['breed'].unique()
filtered_pet_trim_df = pet_trim_df[pet_trim_df['breed'].isin(akc_breeds)]
len(filtered_pet_trim_df) # output number of clean breed rows

13125

In [38]:
# let's do a final cleanup in case we missed any empty values
filtered_pet_trim_df = filtered_pet_trim_df.fillna({
    'name': 'Unknown',
    'outcome_type': 'Unknown',
    'outcome_subtype': 'Unknown',
    'sex': 'Unknown',
    'color': 'Unknown',
    'age': 'Unknown',
    'breed': 'Unknown'
})

In [40]:
# Export the filtered_pet_trim_df dataset to a .csv file
filtered_pet_trim_df.to_csv('datasets/pet_clean.csv', index=False, sep=',', encoding='utf-8')

##Start MySQL Portion

In [33]:
# Load necessary packages:
from sqlalchemy import create_engine, Column, String, Integer, Boolean, BigInteger, Float, text # Database navigation
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import insert
import mysql.connector

In [37]:
# let's look for any rows with missing values
pet_rows_with_missing_values = pet_trim_df[pet_trim_df.isna().any(axis=1)]
print(pet_rows_with_missing_values)

Empty DataFrame
Columns: [name, dob, outcome_type, outcome_subtype, outcome_datetime, sex, color, age, breed]
Index: []


In [34]:
#define mysql connection variables
conn = mysql.connector.connect(
        host="localhost", # This is my local instance number when you open MySQL Workbench.
        user="root", # This is my username for MySQL Workbench
        password="karateChop") # We wrote this password down in our first class!

# Create a cursor object using the cursor() method
cursor = conn.cursor()

# CREATE DATABASE if it does not already exist for assignment 7
cursor.execute("CREATE DATABASE IF NOT EXISTS pet_puffs")

In [35]:
# Time to connect to the database using SQL Alchemy:
DATABASE_URL = "mysql+mysqlconnector://root:karateChop@localhost/pet_puffs" # Use MySQL Connector to connect to the database
engine = create_engine(DATABASE_URL) # Creates a connection to the MySQL database

print("Connected to MySQL database successfully!")
# I'm really not sure why I picked karateChop as a password, must've been something I saw that day. Glad I wrote it down!

Connected to MySQL database successfully!


In [None]:
Create the database tables

In [36]:
cursor = conn.cursor()

# let's create our dog_breed table
create_breed_table_query = """CREATE TABLE IF NOT EXISTS dog_breed (
                              id INT AUTO_INCREMENT PRIMARY KEY,
                              breed VARCHAR(255),
                              temperament VARCHAR(255),
                              popularity FLOAT,
                              min_height FLOAT,
                              max_height FLOAT,
                              min_weight FLOAT,
                              max_weight FLOAT,
                              min_expectancy FLOAT,
                              max_expectancy FLOAT,
                              energy_level_value FLOAT,
                              energy_level_category VARCHAR(255),
                              trainability_value FLOAT,
                              trainability_category VARCHAR(255),
                              demeanor_value FLOAT,
                              demeanor_category VARCHAR(255),
                              created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                              updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
                              );
                           """
# Execute the create_breed_table_query
with engine.connect() as connection:
    connection.execute(text(create_breed_table_query))

print("create_breed_table_query table created successfully!") # output this string if the query worked # run our query to create the dog_breed table

create_breed_table_query table created successfully!


In [37]:
cursor = conn.cursor()

# let's create our pet table
create_pet_table_query = """CREATE TABLE IF NOT EXISTS pet (
                              id INT AUTO_INCREMENT PRIMARY KEY,
                              name VARCHAR(255),
                              dob DATE,
                              sex VARCHAR(255),
                              color VARCHAR(255),
                              age VARCHAR(255),
                              created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                              updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
                              dog_breed_id INT,
                              FOREIGN KEY (dog_breed_id) REFERENCES dog_breed(id)
                              );
                           """
# Execute the create_breed_table_query
with engine.connect() as connection:
    connection.execute(text(create_pet_table_query))

# let's create our pet_outcome table
create_pet_outcome_table_query = """CREATE TABLE IF NOT EXISTS pet_outcome (
                              id INT AUTO_INCREMENT PRIMARY KEY,
                              outcome_type VARCHAR(255),
                              outcome_subtype VARCHAR(255),
                              outcome_datetime DATETIME,
                              created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                              updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
                              pet_id INT,
                              FOREIGN KEY (pet_id) REFERENCES pet(id)
                              );
                           """
# Execute the create_breed_table_query
with engine.connect() as connection:
    connection.execute(text(create_pet_outcome_table_query))

print("created our pet and pet_outcome tables created successfully!") # output this string if the query worked # run our query to create the dog_breed table

created our pet and pet_outcome tables created successfully!


Insert our data into the dog_breed, pet, and pet_outcome tables

In [58]:
cursor.execute("USE pet_puffs;")  # specify the pet_puffs database

# for look to go through each row of the akc_dogs_df
for index, dog in akc_dogs_df.iterrows():

  # INSERT into dog_breed table
  cursor.execute("""INSERT INTO dog_breed (
                      breed,
                      temperament,
                      popularity,
                      min_height,
                      max_height,
                      min_weight,
                      max_weight,
                      min_expectancy,
                      max_expectancy,
                      energy_level_value,
                      energy_level_category,
                      trainability_value,
                      trainability_category,
                      demeanor_value,
                      demeanor_category
                      )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 """, (
                      dog['breed'],
                      dog['temperament'],
                      dog['popularity'],
                      dog['min_height'],
                      dog['max_height'],
                      dog['min_weight'],
                      dog['max_weight'],
                      dog['min_expectancy'],
                      dog['max_expectancy'],
                      dog['energy_level_value'],
                      dog['energy_level_category'],
                      dog['trainability_value'],
                      dog['trainability_category'],
                      dog['demeanor_value'],
                      dog['demeanor_category']
                 ))

# Commit the transaction
conn.commit()

print("dog_breed table is populated")

dog_breed table is populated


In [42]:
cursor.execute("USE pet_puffs;")  # specify the pet_puffs database

# Fairly heavy loop and conditional logic for the pet and pet_outcome INSERTS
for index, pet in filtered_pet_trim_df.iterrows():
    current_breed = pet['breed']  # set the current breed to current_breed
    # Run the SQL to get the id of the breed that matches the current_breed
    cursor.execute("""SELECT id
                      FROM dog_breed
                      WHERE breed = %s""", (current_breed,))  # convert to tuple

    breed_id = cursor.fetchone()  # output the dog_breed id to the breed_id variable
    # Check if breed_id is found, otherwise insert None
    breed_id = breed_id[0] if breed_id else None

    # Run the INSERT into pet table
    cursor.execute("""INSERT INTO pet (
                        name,
                        dob,
                        sex,
                        color,
                        age,
                        dog_breed_id
                      )
                      VALUES (%s, %s, %s, %s, %s, %s)
                   """, (
                        pet['name'],
                        pet['dob'],
                        pet['sex'],
                        pet['color'],
                        pet['age'],
                        breed_id
                  ))

    # Get the last inserted pet ID
    pet_id = cursor.lastrowid
    print(pet_id)

    # If pet_outcome is not equal to None, then insert a record into the pet_outcome table
    if pet['outcome_type'] != 'None':
        cursor.execute("""INSERT INTO pet_outcome (
                            outcome_type,
                            outcome_subtype,
                            outcome_datetime,
                            pet_id
                          )
                          VALUES (%s, %s, %s, %s)
                       """, (
                            pet['outcome_type'],
                            pet['outcome_subtype'],
                            pet['outcome_datetime'],
                            pet_id
                       ))

# Commit the transaction
conn.commit()

print("pet and pet_outcome table is populated")


InternalError: Unread result found

In [None]:
#Close the database connection :)
# mycursor.close()
# conn.close()

Let's populate the pet and pet_outcome tables. This will be a little tricky with the loops and conditionals, so we'll practice the the logic first.

In [None]:
# Create and start SSH tunnel to populate the pet and pet_outcome tables
with SSHTunnelForwarder(
    (ssh_host, ssh_port),
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=(mysql_host, mysql_port),
) as server:
    print(f"SSH tunnel established on port {server.local_bind_port}")
    # MySQL connection must go within the with

    # Connect to MySQL using SSH tunnel
    conn = mysql.connector.connect(
            host="127.0.0.1", # Use localhost since we're connected via SSH
            port=server.local_bind_port,
            user=mysql_username,
            password=mysql_password,
            database="data_warehousing_final",
        )

    cursor = conn.cursor()

    # Test the loop and conditional logic for the INSERTS
    for index, pet in pet_trim_df.iterrows():
        current_breed = pet['breed'] # set the current breed to current_breed
        # Run the SQL to get the id of the breed that matches the current_breed
        cursor.execute("""SELECT id
                        FROM dog_breed
                        WHERE breed = %s""", (
                            (current_breed,) # convert to tuple
                        ))
        breed_id = cursor.fetchone() # output the dog_breed id to the breed_id variable
        # Check if breed_id is found, otherwise insert None
        breed_id = breed_id[0] if breed_id else None

        # Run the INSERT into pet table
        cursor.execute("""INSERT IGNORE INTO pet (
                            name,
                            dob,
                            sex,
                            color,
                            age,
                            dog_breed_id
                            )
                          VALUES (%s, %s, %s, %s, %s, %s)
                       """, (
                            pet['name'],
                            pet['dob'],
                            pet['sex'],
                            pet['color'],
                            pet['age'],
                            breed_id
                      ))

        # Get the last inserted pet ID
        pet_id = cursor.lastrowid
        print(pet_id)

        # If pet_outcome is not equal to None, then insert a record into the pet_outcome table
        if pet['outcome_type'] != 'None':
            cursor.execute("""INSERT IGNORE INTO pet_outcome (
                                outcome_type,
                                outcome_subtype,
                                outcome_datetime,
                                pet_id
                                )
                              VALUES (%s, %s, %s, %s)
                           """, (
                                pet['outcome_type'],
                                pet['outcome_subtype'],
                                pet['outcome_datetime'],
                                pet_id
                           ))

    conn.commit() # Commit the changes after the loop
    conn.close() # We always close the db of course!

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
40579
40580
40581
40582
40583
40584
40585
40586
40587
40588
40589
40590
40591
40592
40593
40594
40595
40596
40597
40598
40599
40600
40601
40602
40603
40604
40605
40606
40607
40608
40609
40610
40611
40612
40613
40614
40615
40616
40617
40618
40619
40620
40621
40622
40623
40624
40625
40626
40627
40628
40629
40630
40631
40632
40633
40634
40635
40636
40637
40638
40639
40640
40641
40642
40643
40644
40645
40646
40647
40648
40649
40650
40651
40652
40653
40654
40655
40656
40657
40658
40659
40660
40661
40662
40663
40664
40665
40666
40667
40668
40669
40670
40671
40672
40673
40674
40675
40676
40677
40678
40679
40680
40681
40682
40683
40684
40685
40686
40687
40688
40689
40690
40691
40692
40693
40694
40695
40696
40697
40698
40699
40700
40701
40702
40703
40704
40705
40706
40707
40708
40709
40710
40711
40712
40713
40714
40715
40716
40717
40718
40719
40720
40721
40722
40723
40724
40725
40726
40727
40728
40729
40730
40731
40732
40733
40734