In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load avocado products sold in the UK and report their most common country of origin.

# Define columns and read avocado data
avocado_columns = [
    'code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags',
    'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries',
    'countries_tags', 'origins', 'origins_tags'
]
# Read barcodes as text so pandas does not infer a numeric/mixed dtype for them.
avocado_df = pd.read_csv(
    'data/avocado.csv', sep='\t', usecols=avocado_columns, dtype={'code': str}
)

# Load avocado category tags (one per line). The encoding is explicit so the
# result does not depend on the platform default, and blank lines are skipped
# so an empty tag can never match an empty token in 'categories_tags'.
with open('data/relevant_avocado_categories.txt', 'r', encoding='utf-8') as file:
    avocado_tags = [line.strip() for line in file if line.strip()]

# Filter avocado data based on categories and focus on the UK.
# A row is kept when at least one of its comma-separated category tags is in
# the relevant tag list and its 'countries' value mentions the United Kingdom.
avocado_tag_set = set(avocado_tags)
avocado_category_match = avocado_df['categories_tags'].apply(
    lambda cats: not avocado_tag_set.isdisjoint(str(cats).split(','))
)
avocado_sold_in_uk = avocado_df['countries'].str.contains("United Kingdom", na=False)
avocado_df_filtered = avocado_df[avocado_category_match & avocado_sold_in_uk].copy()

# Clean country names in 'origins_tags' without destroying the tag structure:
#   "en:peru,en:south-africa" -> "Peru, South Africa"
# The language prefix of every tag is removed, hyphens become spaces and the
# names are title-cased. Separators and non-ASCII letters are preserved.
avocado_df_filtered['origins_tags'] = (
    avocado_df_filtered['origins_tags']
    .str.replace(r'(^|,)\s*[a-z]{2,3}:', r'\1', regex=True)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'\s*,\s*', ', ', regex=True)
    .str.strip()
    .str.title()
)

# Determine the most common avocado origin in the UK.
# Products listing several origins contribute one count to each listed country
# instead of forming a separate combined "country".
avocado_origins = avocado_df_filtered['origins_tags'].dropna().str.split(', ').explode()
avocado_origin_counts = avocado_origins[avocado_origins != ''].value_counts()
if avocado_origin_counts.empty:
    # idxmax() would fail on an empty series with an unhelpful message.
    raise ValueError("No UK avocado products with a recorded origin were found")
top_avocado_origin = avocado_origin_counts.idxmax()
print(f"The most common country of origin for avocados in the UK is: {top_avocado_origin}")

The most common country of origin for avocados in the UK is: Peru


In [4]:
# Load olive oil products sold in the UK and report their most common country of origin.

# Load olive oil CSV file with specified columns and handle mixed types warning by specifying dtype
oliveOil_columns = [
    'code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags',
    'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries',
    'countries_tags', 'origins', 'origins_tags'
]
oliveOil_df = pd.read_csv('data/olive_oil.csv', sep='\t', usecols=oliveOil_columns, dtype={'code': str})

# Load category tags for olive oil with the correct encoding (one per line).
# Blank lines are skipped so an empty tag can never match an empty token in
# 'categories_tags'.
with open('data/relevant_olive_oil_categories.txt', 'r', encoding='utf-8') as file:
    oliveOil_tags = [line.strip() for line in file if line.strip()]

# Drop rows without category tags; they cannot match any relevant category.
oliveOil_df = oliveOil_df.dropna(subset=['categories_tags'])

# Apply filters for relevant categories and UK-specific data.
# A row is kept when at least one of its comma-separated category tags is in
# the relevant tag list and its 'countries' value mentions the United Kingdom.
oliveOil_tag_set = set(oliveOil_tags)
oliveOil_category_match = oliveOil_df['categories_tags'].apply(
    lambda cats: not oliveOil_tag_set.isdisjoint(str(cats).split(','))
)
oliveOil_sold_in_uk = oliveOil_df['countries'].str.contains("United Kingdom", na=False)
oliveOil_df_filtered = oliveOil_df[oliveOil_category_match & oliveOil_sold_in_uk].copy()

# Clean country names in 'origins_tags' without destroying the tag structure:
#   "en:greece,en:european-union" -> "Greece, European Union"
# The language prefix is removed while the ':' separator is still present, so
# a country whose name merely starts with "en" is left intact. Hyphens become
# spaces and names are title-cased, which keeps multi-word names capitalised.
oliveOil_df_filtered['origins_tags'] = (
    oliveOil_df_filtered['origins_tags']
    .str.replace(r'(^|,)\s*[a-z]{2,3}:', r'\1', regex=True)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'\s*,\s*', ', ', regex=True)
    .str.strip()
    .str.title()
)

# Determine the most common olive oil origin in the UK.
# Products listing several origins contribute one count to each listed country
# instead of forming a separate combined "country".
oliveOil_origins = oliveOil_df_filtered['origins_tags'].dropna().str.split(', ').explode()
oliveOil_origin_counts = oliveOil_origins[oliveOil_origins != ''].value_counts()
if oliveOil_origin_counts.empty:
    # idxmax() would fail on an empty series with an unhelpful message.
    raise ValueError("No UK olive oil products with a recorded origin were found")
top_olive_oil_origin = oliveOil_origin_counts.idxmax()

print(f"The most common country of origin for olive oil in the UK is: {top_olive_oil_origin}")


The most common country of origin for olive oil in the UK is: Greece


In [5]:
# Load sourdough products sold in the UK and report their most common country of origin.

# Define columns for loading sourdough data
sourdough_columns = [
    'code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags',
    'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries',
    'countries_tags', 'origins', 'origins_tags'
]

# Load the sourdough data from CSV (barcodes are kept as text)
sourdough_df = pd.read_csv('data/sourdough.csv', sep='\t', usecols=sourdough_columns, dtype={'code': str})

# Load category tags for sourdough with correct encoding (one per line).
# Blank lines are skipped so an empty tag can never match an empty token in
# 'categories_tags'.
with open('data/relevant_sourdough_categories.txt', 'r', encoding='utf-8') as file:
    sourdough_tags = [line.strip() for line in file if line.strip()]

# Drop rows with NaN in 'categories_tags'; they cannot match any relevant category
sourdough_df = sourdough_df.dropna(subset=['categories_tags'])

# Filter data based on relevant categories and focus specifically on the UK.
# A row is kept when at least one of its comma-separated category tags is in
# the relevant tag list and its 'countries' value mentions the United Kingdom.
sourdough_tag_set = set(sourdough_tags)
sourdough_category_match = sourdough_df['categories_tags'].apply(
    lambda cats: not sourdough_tag_set.isdisjoint(str(cats).split(','))
)
sourdough_sold_in_uk = sourdough_df['countries'].str.contains("United Kingdom", na=False)
sourdough_df_filtered = sourdough_df[sourdough_category_match & sourdough_sold_in_uk].copy()

# Clean country names in 'origins_tags' without destroying the tag structure:
#   "en:united-kingdom,en:france" -> "United Kingdom, France"
# The language prefix is removed while the ':' separator is still present,
# hyphens become spaces and names are title-cased. Title-casing (rather than
# str.capitalize) keeps every word of a multi-word name capitalised, so the
# result reads "United Kingdom" and not "United kingdom".
sourdough_df_filtered['origins_tags'] = (
    sourdough_df_filtered['origins_tags']
    .str.replace(r'(^|,)\s*[a-z]{2,3}:', r'\1', regex=True)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'\s*,\s*', ', ', regex=True)
    .str.strip()
    .str.title()
)

# Determine the most common sourdough origin in the UK.
# Products listing several origins contribute one count to each listed country
# instead of forming a separate combined "country".
sourdough_origins = sourdough_df_filtered['origins_tags'].dropna().str.split(', ').explode()
sourdough_origin_counts = sourdough_origins[sourdough_origins != ''].value_counts()
if sourdough_origin_counts.empty:
    # idxmax() would fail on an empty series with an unhelpful message.
    raise ValueError("No UK sourdough products with a recorded origin were found")
top_sourdough_origin = sourdough_origin_counts.idxmax()
print(f"The most common country of origin for sourdough in the UK is: {top_sourdough_origin}")


The most common country of origin for sourdough in the UK is: United kingdom
