Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats

Load the Dataset

In [None]:
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the CSV before running the notebook.
dataset_path = "D:/dataset/Dog Breads Around The World.csv"
dog_data = pd.read_csv(dataset_path)

# Structure of the dataset.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
print("Structure of the dataset:")
dog_data.info()
print()

# Display the first few rows of the dataset
dog_data.head()

Clean and Transform the Dataset

In [None]:
# Clean and transform the dataset.
# Work on a copy so the frame loaded from disk stays untouched.
dog_data_clean = dog_data.copy()

# Descriptive text columns -> unordered categoricals.
for nominal_col in ['Name', 'Origin', 'Type', 'Unique Feature', 'Good with Children']:
    dog_data_clean[nominal_col] = dog_data_clean[nominal_col].astype('category')

# Ranked columns -> ordered categoricals with an explicit level order.
# Values that are not one of the listed levels become NaN.
four_step_scale = ['Low', 'Moderate', 'High', 'Very High']
ordinal_levels = {
    'Size': ['Small', 'Medium', 'Large'],
    'Grooming Needs': four_step_scale,
    'Health Issues Risk': ['Low', 'Moderate', 'High'],
    'Shedding Level': four_step_scale,
}
for ordinal_col, levels in ordinal_levels.items():
    dog_data_clean[ordinal_col] = pd.Categorical(
        dog_data_clean[ordinal_col], categories=levels, ordered=True
    )

# Measurement columns -> numeric; entries that cannot be parsed become NaN.
numeric_cols = [
    'Exercise Requirements (hrs/day)',
    'Average Weight (kg)',
    'Training Difficulty (1-10)',
    'Friendly Rating (1-10)',
    'Intelligence Rating (1-10)',
    'Life Span',
]
for numeric_col in numeric_cols:
    dog_data_clean[numeric_col] = pd.to_numeric(dog_data_clean[numeric_col], errors='coerce')

# Discard every row that has a missing value in any column (including the
# NaNs introduced by the conversions above).
dog_data_clean = dog_data_clean.dropna()

# Preview the cleaned data
dog_data_clean.head()

Summary Statistics for Numerical Columns

In [None]:
# Descriptive statistics of the cleaned frame; describe() with default
# arguments summarises the numeric columns.
numeric_summary = dog_data_clean.describe()
numeric_summary

Frequency of Dog Breeds

In [None]:
# Frequency of dog breeds (or names).
# The index is named 'Breed' *before* reset_index(), so the label column is
# called 'Breed' on every pandas version. (Renaming an 'index' column afterwards
# is a silent no-op on pandas >= 2.0, where the column comes out as 'Name'.)
breed_count = (
    dog_data_clean['Name']
    .value_counts()  # already ordered by descending count
    .rename_axis('Breed')
    .reset_index(name='Count')
)

# 'Name' is categorical, so value_counts() also reports categories that no
# longer have any rows (breeds removed during cleaning) with a count of 0.
# Keep only breeds that are actually present.
breed_count = breed_count[breed_count['Count'] > 0]

# Display top 10 most common breeds
breed_count.head(10)

Bar Plot of Dog Breeds by Type

In [None]:
# Bar plot of dog breeds by Type using Plotly.
# Only the x column is supplied; no y column is passed.
# NOTE(review): the 'count' label presumably targets the bar-height axis that
# plotly express generates in that case -- confirm against the rendered figure.
type_axis_labels = {'Type': 'Dog Type', 'count': 'Count'}
fig = px.bar(
    dog_data_clean,
    x='Type',
    title='Distribution of Dog Breeds by Type',
    labels=type_axis_labels,
)
# Stack the bar segments and tilt the tick labels by -45 degrees.
fig.update_layout(barmode='stack', xaxis_tickangle=-45)
fig.show()

Boxplot of Weight by Dog Size

In [None]:
# Box plot: one box of average weight per dog-size category (Plotly).
weight_by_size_labels = {'Size': 'Dog Size', 'Average Weight (kg)': 'Weight (kg)'}
fig = px.box(
    dog_data_clean,
    x='Size',
    y='Average Weight (kg)',
    title="Boxplot of Dog Weight by Size",
    labels=weight_by_size_labels,
)
fig.show()

Scatter Plot of Life Span vs Average Weight

In [None]:
# Scatter plot: life span against average weight, points coloured by size (Plotly).
lifespan_axis_labels = {
    'Average Weight (kg)': 'Average Weight (kg)',
    'Life Span': 'Life Span (years)',
}
fig = px.scatter(
    dog_data_clean,
    x='Average Weight (kg)',
    y='Life Span',
    color='Size',
    title="Life Span vs Average Weight",
    labels=lifespan_axis_labels,
)
fig.show()

Histogram of Exercise Requirements (hours/day)

In [None]:
# Histogram of daily exercise requirements, requesting up to 20 bins (Plotly).
exercise_col = 'Exercise Requirements (hrs/day)'
fig = px.histogram(
    dog_data_clean,
    x=exercise_col,
    nbins=20,
    title="Distribution of Exercise Requirements",
    labels={exercise_col: 'Exercise Requirements (hrs/day)'},
)
fig.show()

Bar Plot of Grooming Needs

In [None]:
# Bar plot of the grooming-needs column using Plotly.
# Only the x column is supplied; no y column is passed.
grooming_axis_labels = {'Grooming Needs': 'Grooming Needs'}
fig = px.bar(
    dog_data_clean,
    x='Grooming Needs',
    title="Distribution of Grooming Needs",
    labels=grooming_axis_labels,
)
fig.show()

Correlation Matrix for Numerical Variables

In [None]:
# Pairwise correlations between a hand-picked set of numeric columns.
corr_columns = [
    'Life Span',
    'Friendly Rating (1-10)',
    'Training Difficulty (1-10)',
    'Average Weight (kg)',
]
cor_data = dog_data_clean.loc[:, corr_columns]

# DataFrame.corr() with default arguments (Pearson correlation).
cor_matrix = cor_data.corr()

# Show the resulting matrix as plain text
print(cor_matrix)

ANOVA for Weight Across Grooming Needs

In [None]:
# One-way ANOVA: does average weight differ across grooming-need levels?
#
# The samples are built from the levels that actually occur in the cleaned data
# (observed=True) instead of a hard-coded list of four levels. If a level has no
# rows left after cleaning, passing its empty sample to f_oneway would make the
# whole result NaN.
weight_by_grooming = [
    weights.to_numpy()
    for _, weights in dog_data_clean.groupby('Grooming Needs', observed=True)['Average Weight (kg)']
]

# f_oneway needs at least two samples; fail with a clear message otherwise.
if len(weight_by_grooming) < 2:
    raise ValueError(
        "ANOVA requires at least two grooming-need levels with data; "
        f"found {len(weight_by_grooming)}."
    )

anova_result = stats.f_oneway(*weight_by_grooming)

# Display the ANOVA result
print("ANOVA result for weight across grooming needs:")
print(f"F-statistic: {anova_result.statistic}")
print(f"P-value: {anova_result.pvalue}")

Visualize the ANOVA Result

In [None]:
# Companion plot for the ANOVA: weight distribution per grooming-need level (Plotly).
grooming_weight_labels = {
    'Grooming Needs': 'Grooming Needs',
    'Average Weight (kg)': 'Weight (kg)',
}
fig = px.box(
    dog_data_clean,
    x='Grooming Needs',
    y='Average Weight (kg)',
    title="Weight Across Grooming Needs",
    labels=grooming_weight_labels,
)
fig.show()

Summary of Characteristics by Dog Size

In [None]:
# Summary of characteristics by dog size: mean and median of weight and life
# span for each size category.
#
# 'Size' is categorical, so `observed` is passed explicitly. observed=False
# keeps one row per declared category (NaN statistics for a size with no rows),
# which is what this cell has always produced. Leaving it implicit raises a
# FutureWarning on recent pandas and would change the output once the default
# flips.
size_comparison = (
    dog_data_clean
    .groupby('Size', observed=False)
    .agg(
        Mean_Weight=('Average Weight (kg)', 'mean'),
        Median_Weight=('Average Weight (kg)', 'median'),
        Mean_LifeSpan=('Life Span', 'mean'),
        Median_LifeSpan=('Life Span', 'median'),
    )
    .reset_index()
)

# View the summary
print(size_comparison)