In [5]:
# Canadian Cheese & Climate Analysis  
# **Objective:** Explore how province-level temperature relates to cheese production and diversity using Kaggle’s Cheese Directory and Weather datasets.

In [20]:
# Fix for pandasgui.show() in recent IPython versions: pandasgui calls
# ip.magic(), which the shell object may not provide, so it is aliased below.
import IPython

ip = IPython.get_ipython()
if ip is not None:
    if not hasattr(ip, "magic"):
        # alias .magic() to .run_line_magic() so pandasgui can call it
        ip.magic = ip.run_line_magic

    # Turn on the Qt5 event loop for PandasGUI.
    # Kept inside the `ip is not None` guard: get_ipython() returns None when
    # no IPython shell is active, and calling a method on None would raise
    # AttributeError.
    ip.run_line_magic("gui", "qt5")


In [6]:
# Importing necessary libraries
import webbrowser
import pandas as pd
from IPython.display import display
import plotly.express as px

# # Fallback for pandasgui.show if it isn't installed
# try:
#     from pandasgui import show
# except ImportError:
#     def show(df):
#         print(df.head())

In [7]:
# Read the two raw CSV inputs into dataframes:
#   df     <- cheese_data.csv
#   new_df <- canada_weather.csv
cheese_csv, weather_csv = "cheese_data.csv", "canada_weather.csv"
df = pd.read_csv(cheese_csv)
new_df = pd.read_csv(weather_csv)

In [8]:
# Cleaning and extracting the relevant information.
# Columns are selected by position: 1 -> province, 3 -> moisture percent,
# 6 -> organic, 7 -> cheese category.
# NOTE(review): the meaning of each position is taken from the names used in
# this cell; confirm against the header of cheese_data.csv.
def _clean_text(column):
    """Return `column` with every value passed through str() and stripped.

    str() is applied per value, so a missing entry becomes the literal string
    'nan' instead of remaining NaN.
    """
    return column.map(lambda value: str(value).strip())


# Build all four columns in one pass instead of looping over rows. Unlike a
# row-by-row list of dicts, this keeps the column names when `df` has no rows.
cheese_df = pd.DataFrame({
    "Province": _clean_text(df.iloc[:, 1]),
    "Moisture Percent": df.iloc[:, 3].astype(float),
    "Organic": df.iloc[:, 6].astype(float),
    "Type of Cheese": _clean_text(df.iloc[:, 7]),
}).reset_index(drop=True)  # fresh 0..n-1 index, independent of df's index

In [9]:
# Aggregating the cheese statistics per province
agg_df = cheese_df.groupby('Province').agg({
    'Moisture Percent': 'mean',  # mean of the moisture column per province
    'Organic': 'sum'             # sum of the Organic column per province
})

# One column per cheese type; each cell is the number of rows of that type
cheese_counts = pd.crosstab(cheese_df['Province'], cheese_df['Type of Cheese'])

# Rows with a missing type were stringified to 'nan' when cheese_df was built;
# label that bucket 'Other'. The column is renamed by name rather than by
# position so a real category is never relabelled when no 'nan' bucket exists
# (and 'Organic' is never relabelled when the crosstab has no columns).
cheese_counts = cheese_counts.rename(columns={'nan': 'Other'})

final_df = pd.concat([agg_df, cheese_counts], axis=1)

# Every column except the two aggregate statistics is a per-type count
cheese_cols = final_df.columns.difference(['Moisture Percent', 'Organic'])
# Number of types with at least one cheese in the province
final_df['Num_Cheese_Types'] = final_df[cheese_cols].gt(0).sum(axis=1)
# Total number of cheese rows for the province across all types
final_df['Total_Cheese_Produced'] = final_df[cheese_cols].sum(axis=1)

In [10]:
# Helper that normalizes typographic characters in the weather data's number strings
def clean_number(s):
    """Parse `s` as a float after normalizing typographic characters.

    The value is converted with str() and stripped of surrounding whitespace.
    The Unicode minus sign and the en dash are mapped to an ASCII hyphen, and
    a comma is mapped to a decimal point, before calling float().
    """
    substitutions = str.maketrans({'−': '-', '–': '-', ',': '.'})
    normalized = str(s).strip().translate(substitutions)
    return float(normalized)

# Organizing the information and loading it to a new dataframe
def _split_celsius_fahrenheit(stat):
    """Split one temperature cell into a (celsius, fahrenheit) pair of floats.

    The text before the first '(' is parsed as Celsius and the text after the
    last '(' (with the closing ')' removed) as Fahrenheit. Presumably cells
    look like '-5.4 (22.3)' -- confirm against canada_weather.csv.

    When the cell has no parenthesised part, Fahrenheit is NaN rather than a
    silent copy of the Celsius value.
    """
    pieces = str(stat).split('(')
    celsius = clean_number(pieces[0])
    if len(pieces) == 1:
        return celsius, float('nan')
    # strip() first so trailing whitespace after ')' does not block rstrip(')')
    return celsius, clean_number(pieces[-1].strip().rstrip(')'))


rows = []
for _, row in new_df.iterrows():
    # Province is whatever follows the last comma of the 'Community' value
    province = row['Community'].split(',')[-1].strip()

    # Positional columns 8 and 9 hold the high and low temperature cells
    high_c, high_f = _split_celsius_fahrenheit(row.iloc[8])
    low_c, low_f = _split_celsius_fahrenheit(row.iloc[9])

    rows.append({
        "Province": province,
        "High_Celsius": high_c,
        "High_Farenheit": high_f,
        "Low_Celsius": low_c,
        "Low_Farenheit": low_f
    })

# The explicit column list keeps the schema even when the file has no rows
temp_df = pd.DataFrame(rows, columns=[
    "Province", "High_Celsius", "High_Farenheit", "Low_Celsius", "Low_Farenheit"
])

In [11]:
# Per-province mean of every numeric temperature column, followed by the
# midpoint between the mean high and the mean low on each temperature scale
avg_by_province = temp_df.groupby('Province').mean(numeric_only=True)
for scale in ('Celsius', 'Farenheit'):
    high_plus_low = avg_by_province['High_' + scale] + avg_by_province['Low_' + scale]
    avg_by_province['Avg ' + scale] = high_plus_low / 2

In [12]:
# Merging the cheese and weather dataframes by matching them on province.
# The inner join keeps only provinces that appear in both tables.
cheese_by_province = final_df.reset_index()
weather_by_province = avg_by_province.reset_index()
combined_df = pd.merge(
    cheese_by_province,
    weather_by_province,
    on='Province',
    how='inner'
)
# A bare `combined_df.head()` in the middle of a cell shows nothing; display()
# renders the preview explicitly.
display(combined_df.head())

# An inner join drops non-matching provinces without any warning (e.g. when
# the two sources spell a province differently), so report them.
unmatched = set(cheese_by_province['Province']) ^ set(weather_by_province['Province'])
if unmatched:
    print("Provinces present in only one dataset (dropped by the merge):",
          sorted(unmatched, key=str))

# Checking if there are any missing values
na_counts = combined_df.isna().sum()
missing = na_counts[na_counts > 0]

if not missing.empty:
    print("Missing values detected:")
    display(missing)

In [None]:
## Visualization 1: Avg Temperature vs. Cheese Production

#This scatterplot shows average temperature per province vs total cheese production.  
# - **X-axis**: Average Temperature (°C)  
# - **Y-axis**: Total Cheese Produced  
# - **Bubble Size**: Number of unique types of cheese  
# - **Color**: Average moisture percent  

# We use this chart to assess temperature's effect on the quantity and variety of cheese produced.

In [13]:
# Creating the first visualization: Scatter Plot
# x = average temperature, y = total cheese count, bubble size = number of
# cheese types, colour = average moisture percent.

fig1 = px.scatter(
    combined_df,
    x='Avg Celsius',
    y='Total_Cheese_Produced',
    size='Num_Cheese_Types',
    color='Moisture Percent',
    hover_name='Province',
    # Plotly renders title text as HTML-like markup, so a line break must be
    # written as <br>; a '\n' does not start a new line in the figure.
    title='Avg Temperature vs. Total Cheese Produced<br>(Bubble size = # of Types, Color = Moisture %)',
    labels={
        'Avg Celsius': 'Average Temperature (°C)',
        'Total_Cheese_Produced': 'Total Cheese Produced',
        'Num_Cheese_Types': '# of Cheese Types',
        'Moisture Percent': 'Avg Moisture %'
    }
)
fig1.show()

In [None]:
## Visualization 2: Cheese Category Distribution by Province

#This stacked bar chart analyzes the distribution of the different cheese categories across the Provinces.  
#The provinces are sorted by the total cheese production volume.  

#This allows for assessing if any provinces are specialized in certain cheese types or produce a wider variety.

In [14]:
# Creating the second visualization: Stacked Bar Graph
# Every column that is not an identifier, a summary statistic or a temperature
# field is treated as a per-category cheese count.
non_category_cols = {
    'Province', 'Moisture Percent', 'Organic',
    'Num_Cheese_Types', 'Total_Cheese_Produced',
    'High_Celsius', 'High_Farenheit', 'Low_Celsius',
    'Low_Farenheit', 'Avg Celsius', 'Avg Farenheit',
}
type_cols = [col for col in combined_df.columns if col not in non_category_cols]

# Provinces ordered from the largest to the smallest total
provinces_by_total = (
    combined_df
    .sort_values('Total_Cheese_Produced', ascending=False)['Province']
    .tolist()
)

fig2 = px.bar(
    combined_df,
    x='Province',
    y=type_cols,
    title='Cheese Category Composition by Province',
    labels={'value': 'Count of Cheeses', 'variable': 'Cheese Category'},
    barmode='stack',
)
# Pin the x-axis category order to the ranking computed above
fig2.update_layout(
    xaxis=dict(categoryorder='array', categoryarray=provinces_by_total)
)
fig2.show()

In [None]:
# Key Inferences: 

# **1. Temperate Provinces Achieve High Output and Diversity**  
#    The scatter plot indicates that provinces with **moderate average temperatures (4–7°C)**, like **Quebec** and **Ontario**, exhibit both **high total cheese production** and **a wide diversity of cheese types.** They are not just producing volume; they are also producing a mix of types such as soft, semi-soft, and firm cheese.  
#    ➤ *This suggests that temperate climates may provide the most favourable year-round conditions for dairy operations, with cheese as the value-added end product.*  

# **2. Cheese Diversity Grows with Total Output**  
#    The bubble chart indicates that provinces producing more cheese also offer a **greater diversity of unique categories.** Quebec, for example, has over 700 total entries — more than the next two provinces combined — and it also leads in unique categories and organic cheeses.  
#    ➤ *This suggests that once a province has built its base production systems, it can pivot or expand into artisanal or specialty niches from that foundational infrastructure.*  

# **3. Clear Regional Strengths in Cheese Profiles**  
#    The stacked bar chart shows the **unique cheese profiles by province.** Quebec shows a balanced production of soft, semi-soft, and firm cheese. Ontario shows particular strength in firm and fresh cheeses. One outlier is British Columbia, which has a strong showing of soft cheese.  
#    ➤ *This heterogeneity indicates that climate, customer demand, and tradition shape each region's cheese selection.*

# **3. Clear Regional Strengths in Cheese Profiles**  
#    The stacked bar chart illustrates **patterns of cheese profile uniqueness by province**. Quebec produces a balanced proportion of soft, semi-soft, and hard. Ontario shows strength in firm and fresh cheeses, while BC also capitalizes on having one of the highest amounts of soft cheeses.  
#    ➤ *This variation across regions indicates that climate, consumer environmental demand, and traditions act in synergy to develop cheese specialization.*

# **4. Moisture Content Is Style-Driven, Not Climate-Driven**  
#    Although the provinces span significantly different climates, there is no direct relationship between the average monthly temperature and **cheese moisture percent**. For example, the average moisture percent in New Brunswick was quite similar to that in Ontario, even though the two provinces sit at opposite ends of the temperature range.  
#    ➤ *This demonstrates that moisture is influenced more by cheese-making process (i.e., fresh vs aged cheese) than ambient weather conditions.*

# **5. Quebec Leads in Organic Production — A Model for Scaled Artisanal Output**  
#    Quebec accounted for **over 70% of all organic cheeses**, showing that even a province with a strong share of the cheese market can develop environmentally sustainable practices at scale. This leadership reinforces the province's overall market share and its wide variety of cheeses.  
#    ➤ *This implies the potentially critical role of public supportive regional political initiatives, consumer environmental cognizance, and the establishment of market places to develop organic ecological systems.*

### Summary of Key Relationships

# | Insight                            | Evidence from Visualizations                                |
# |------------------------------------|--------------------------------------------------------------|
# | Moderate temps → high output       | Bubble chart shows Quebec/ON with high output at 4–7°C       |
# | High output → more categories      | Bubble size and stacked bars correlate production & variety  |
# | Cheese style ≠ climate             | Moisture levels consistent across diverse climates           |
# | Quebec = organic & diverse leader | Stacked bars + category counts confirm breadth + organic     |

In [18]:
# Exporting the final csv for the cheese + weather stats.
# index=False leaves the dataframe's row index out of the written file.
combined_df.to_csv("cleaned_cheese_weather_data.csv", index=False)