In [1]:
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
from datetime import datetime
import os

def load_master_data_safely():
    """Safely load master data and check columns"""
    print("📊 Loading master dataset safely...")
    
    try:
        # Try loading with coordinates first
        if os.path.exists("data/master_transactions_with_coords.parquet"):
            master_data = pl.read_parquet("data/master_transactions_with_coords.parquet")
            print(f"✅ Master data with coordinates loaded: {master_data.shape}")
        else:
            # Load original master data
            master_data = pl.read_parquet("data/master_transactions.parquet")
            print(f"✅ Master data loaded: {master_data.shape}")
            
            # Add coordinates if needed
            print("🗺️ Adding coordinates...")
            stores_original = pd.read_csv(r'C:\Users\Alan\Downloads\BDM\BDM_raw_data\stores.csv')
            stores_coords = pl.from_pandas(stores_original).select([
                'Store ID', 'Latitude', 'Longitude'
            ])
            
            master_data = master_data.join(stores_coords, on='Store ID', how='left')
            master_data.write_parquet("data/master_transactions_with_coords.parquet")
            print("✅ Coordinates added and saved!")
        
        # Verify key columns exist
        required_columns = ['Line_Total_USD', 'Unit Price', 'Quantity', 'Country', 'Transaction Type']
        missing_columns = [col for col in required_columns if col not in master_data.columns]
        
        if missing_columns:
            print(f"❌ Missing columns: {missing_columns}")
            return None
        
        print("✅ All required columns found!")
        return master_data
        
    except Exception as e:
        print(f"❌ Error loading master data: {e}")
        return None


In [2]:
master_data = load_master_data_safely()

📊 Loading master dataset safely...
✅ Master data with coordinates loaded: (6416827, 45)
✅ All required columns found!


In [16]:
import polars as pl

# Assuming master_data is a Polars DataFrame
city_counts_df = master_data.groupby("City").count().sort("count", descending=True)

# Save to CSV
city_counts_df.write_csv("data/city_counts.csv")


AttributeError: 'DataFrame' object has no attribute 'groupby'

In [35]:
import polars as pl

# Confirm your frame is a normal DataFrame, not LazyFrame
print(type(master_data))  # should show: polars.dataframe.frame.DataFrame

# Now perform value counts
city_counts_df = (
    df
    .group_by("City")  # not `groupby`, correct is `group_by`
    .agg(pl.count().alias("Count"))
    .sort("Count", descending=True)
)

# Save to CSV
city_counts_df.write_csv("data/city_counts2.csv")


<class 'polars.dataframe.frame.DataFrame'>



`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



In [None]:
import polars as pl

city_translation_map = {
}

country_translation_map = {
    '中国': 'China',
    'España': 'Spain',
    'Portugal': 'Portugal',
    'Deutschland': 'Germany',
    'France': 'France',
    'United Kingdom': 'United Kingdom',
    'United States': 'United States'
}

def clean_and_translate_data(master_data, city_translation_map, country_translation_map):
    """
    Clean and translate city and country names into English using Polars DataFrame.
    Compatible with older Polars versions.
    """
    master_data = master_data.with_columns([
        pl.col("City").replace(city_translation_map).alias("City"),
        pl.col("Country").replace(country_translation_map).alias("Country")
    ])
    return master_data



In [29]:
# city_translation_map = {"München": "Munich", "Köln": "Cologne"}
# country_translation_map = {"Deutschland": "Germany"}

df = clean_and_translate_data(master_data, city_translation_map, country_translation_map)


In [33]:
df["City"].value_counts()

City,count
str,u32
"""Motherwell""",2065
"""Ronda""",276
"""Alboraya""",1724
"""Huyton""",2113
"""Skokie""",5660
…,…
"""Kornwestheim""",417
"""Rubí""",2113
"""Mettmann""",231
"""Peoria (AZ)""",4874
