# Chema Gálvez 
All save calls are commented out so the visualizations can be seen just by running the notebook.

In [None]:
import altair as alt
import pandas as pd
import polars as pl

In [79]:
# Load datasets
# Paths are relative ("../data/..."), so they resolve against the current working directory.
# Manufacturing-share workbook; make_kpi_donut reads its "country" and "share" columns.
df_solar_manufacture_share_countries = pl.read_excel("../data/pv_manufacture_share_2023.xlsx")
# Monthly export CSV; china_solar_exports_df() parses its "Date" column and filters on "Area".
df_china_solar_exports = pl.read_csv("../data/mart_solar_exports_full_release_monthly.csv")
# COFI workbook; cofi_df() keeps only the rows whose "primary_fuel" is a solar technology.
df_cofi = pl.read_excel("../data/COFI_Database_Version_October_2023.xlsx")

Could not determine dtype for column 39, falling back to string
Could not determine dtype for column 40, falling back to string
Could not determine dtype for column 42, falling back to string


## KPI donut charts

In [None]:
def make_kpi_donut(country: str, color="#4C72B0"):
    '''
    Plot a donut chart of the given country with the percentage inside the donut
    and country label on top

    Parameters:
    country : str
        Value looked up in the "country" column of
        df_solar_manufacture_share_countries.
    color : str, default = "#4C72B0"
        Fill color of the share arc; the remainder arc is drawn in light gray.

    Returns:
        Layered Altair chart (donut, centered percentage, country label).

    Raises:
        ValueError: if no row of the share table matches the given country.
    '''
    # Single source of truth for the square canvas size (pixels); it is also
    # used to center the country label horizontally.
    chart_size = 160

    matches = df_solar_manufacture_share_countries.filter(pl.col("country") == country)
    if matches.height == 0:
        # Fail with a readable message instead of an out-of-bounds index error
        raise ValueError(f"No manufacturing share found for country {country!r}")

    # If several rows match, the first one is used
    share = matches["share"][0]

    # NOTE(review): "100 - share" assumes the share is stored in percent (0-100) - confirm
    df = pd.DataFrame({
        "category": ["share", "remainder"],
        "value": [share, 100 - share],
        "color": [color, "lightgray"]
    })

    # scale=None makes Altair use the literal color strings from the data
    donut = alt.Chart(df).mark_arc(
        innerRadius=45,
        outerRadius=70
    ).encode(
        theta="value:Q",
        color=alt.Color("color:N", scale=None)
    )

    center_text = alt.Chart(pd.DataFrame({"label": [f"{share:.1f}%"]})).mark_text(
        font="Helvetica Neue",
        align="center",
        baseline="middle",
        fontSize=24,
        fontWeight="bold",
        color="black"
    ).encode(text="label:N")

    # Country label sits just above the plotting area (negative y), which is
    # why clipping is disabled on the view below.
    title_label = alt.Chart(pd.DataFrame({"label": [country]})).mark_text(
        font="Helvetica Neue",
        fontSize=20,
        fontWeight=600,
        color="#111",
        align="center"
    ).encode(
        text="label:N",
        x=alt.value(chart_size / 2),
        y=alt.value(-5)
    )

    return alt.layer(
        donut,
        center_text,
        title_label
    ).properties(
        width=chart_size,
        height=chart_size
    ).configure_view(
        stroke=None,
        clip=False
    )


In [90]:
# Export the China donut as SVG, then build it again so it renders as the cell output.
make_kpi_donut("China", "#FFCF63").save("article-images/China.svg")
make_kpi_donut("China", "#FFCF63")

In [91]:
# The save call is commented out here, so the first chart is built and discarded;
# only the second call is displayed as the cell output.
make_kpi_donut("Vietnam", "#046C84")#.save("article-images/Vietnam.svg")
make_kpi_donut("Vietnam", "#046C84")

In [92]:
# Export the India donut as SVG, then build it again so it renders as the cell output.
make_kpi_donut("India", "#F46C34").save("article-images/India.svg")
make_kpi_donut("India", "#F46C34")

In [93]:
# Export the Thailand donut as SVG, then build it again so it renders as the cell output.
make_kpi_donut("Thailand", "#3C045B").save("article-images/Thailand.svg")
make_kpi_donut("Thailand", "#3C045B")

In [94]:
# Export the United States donut as SVG (the file name contains a space),
# then build it again so it renders as the cell output.
make_kpi_donut("United States", "#0D73B1").save("article-images/United States.svg")
make_kpi_donut("United States", "#0D73B1")

In [95]:
# Export the Malaysia donut as SVG, then build it again so it renders as the cell output.
make_kpi_donut("Malaysia", "#042C5C").save("article-images/Malaysia.svg")
make_kpi_donut("Malaysia", "#042C5C")

In [89]:
# Export the "Others" donut as SVG, then build it again so it renders as the cell output.
make_kpi_donut("Others", "black").save("article-images/Others.svg")
make_kpi_donut("Others", "black")

## Area chart - China Solar Exports Colored by Tech

In [None]:
def china_solar_exports_df():
    '''
    Prepare the raw China solar exports table for analysis.

    Steps:
        1. Parse the string 'Date' column into a date column.
        2. Drop rows whose 'Area' is one of the aggregated "Other ..." categories.
        3. Keep only rows dated on or before 31 August 2025.

    Returns:
        The cleaned and filtered exports table.
    '''
    aggregated_areas = ["Other Oceania", "Other Africa", "Other Latin America"]
    cutoff = pl.date(2025, 8, 31)

    with_dates = df_china_solar_exports.with_columns(pl.col("Date").str.to_date())
    without_aggregates = with_dates.filter(~pl.col("Area").is_in(aggregated_areas))

    # Later months are left out due to potential tariff-related distortions
    return without_aggregates.filter(pl.col("Date") <= cutoff)

In [None]:
def china_solar_exports_regions_df(region: str):
    '''
    Select region-level rows of the cleaned Chinese solar exports table.

    Parameters:
    region : str
        - "world": keep only the "World" area (global total).
        - "regions": keep every region-level area except "World".
        - anything else: keep only the area whose name equals this value.

    Returns:
        Rows with "Area type" equal to "Region", narrowed as described above.
    '''
    region_rows = china_solar_exports_df().filter(pl.col("Area type") == "Region")
    area = pl.col("Area")

    if region == "regions":
        return region_rows.filter(area != "World")

    # "world" maps to the "World" area; any other value is matched literally
    wanted = "World" if region == "world" else region
    return region_rows.filter(area == wanted)
    
def china_solar_exports_monthly(region: str):
    '''
    Sum export capacity per area, date and commodity type.

    Parameters
    region : str
        Region selector forwarded to china_solar_exports_regions_df().

    Returns:
        One row per (Region, Date, Commodity type) with "Total Capacity (MW)",
        sorted by those three columns.
    '''
    group_keys = ["Area", "Date", "Commodity type"]
    capacity_total = pl.col("Capacity (MW)").sum().alias("Total Capacity (MW)")

    grouped = china_solar_exports_regions_df(region).group_by(group_keys).agg(capacity_total)
    # Downstream charts refer to the area column as "Region"
    renamed = grouped.rename({"Area": "Region"})
    return renamed.sort(["Region", "Date", "Commodity type"])

def china_solar_exports_monthly_by_region(region: str):
    '''
    Collapse the commodity breakdown into one monthly total per region.

    Parameters:
    region : str
        Region selector forwarded to china_solar_exports_monthly().

    Returns:
        One row per (Region, Date) with the summed "Total Capacity (MW)",
        sorted by region and date.
    '''
    keys = ["Region", "Date"]
    per_commodity = china_solar_exports_monthly(region)

    # Summing over the commodity types leaves a single value per region and month
    summed = per_commodity.group_by(keys).agg(pl.col("Total Capacity (MW)").sum())
    return summed.sort(keys)

In [None]:
def world_exports_stacked_area():
    '''
    Build the stacked area chart of China's worldwide solar exports over time,
    with one colored band per technology (panels, cells, wafers).

    Returns
        Interactive Altair stacked area chart of monthly export capacity.
    '''
    tech_order = ["Panels", "Cells", "Wafers"]
    tech_colors = ['#FBA414', '#12AA9C', '#5A1216']

    # Encodings are named up front so the chart assembly below stays short
    x_year = alt.X(
        "Date:T",
        title="Year",
        axis=alt.Axis(format="%Y", tickCount="year")
    )
    y_capacity = alt.Y(
        "Total Capacity (MW):Q",
        title="Total Capacity (MW)",
        stack="zero"
    )
    tech_color = alt.Color(
        "Commodity type:N",
        title="Technology",
        scale=alt.Scale(domain=tech_order, range=tech_colors)
    )
    hover = [
        alt.Tooltip("Date:T", title="Date", format="%Y-%m"),
        alt.Tooltip("Commodity type:N", title="Technology"),
        alt.Tooltip("Total Capacity (MW):Q", title="Monthly Capacity (MW)", format=",")
    ]

    area = alt.Chart(china_solar_exports_monthly("world")).mark_area().encode(
        x=x_year,
        y=y_capacity,
        color=tech_color,
        tooltip=hover
    )
    sized = area.properties(
        title="China’s Global Solar Technology Export Volumes Over Time",
        width=750,
        height=400
    )
    titled = sized.configure_title(fontSize=17, font="Helvetica", anchor="middle")

    return titled.interactive()

# Export the stacked area chart as SVG, then build it again so it renders as the cell output.
world_exports_stacked_area().save("article-images/china_world_exports_area.svg")
world_exports_stacked_area()

## Line Chart Exports by Region

In [195]:
# ChatGPT code after prompt: This is my chart but Latin America and Africa are
# on top of each other, help me create space between them [my original function]
def plot_regional_exports():
    """
    Plot regional trends in China's monthly solar export capacity.

    This chart displays regions as separate lines and places labels at the
    most recent data point of each region. Label spacing is adjusted in data
    units to prevent overlapping, especially for regions with similar export
    levels.

    Returns:
        Line chart with endpoint markers and labels positioned to minimize overlap.
    """

    # Overlapping groupings and small regions that are left out of the chart
    regions_to_remove = ["EU", "G20", "G7",
                         "OECD", "ASEAN", "Middle East", "Oceania"]

    domain_list = [
        "Africa", "Asia", "Europe",
        "Latin America and Caribbean",
        "North America"
    ]

    color_range = [
        "#D0957E",   # Africa
        "#F7D881",   # Asia
        "#006796",   # Europe
        "#FF8F00",   # Latin America & Caribbean
        "#6ABA92",   # North America
    ]

    df_regions = (
        china_solar_exports_monthly_by_region("regions")
        .filter(~pl.col("Region").is_in(regions_to_remove))
    )

    # Identify the final (most recent) data point for each region
    last_points = (
        df_regions
        .with_columns(
            pl.col("Date").rank(method="dense", descending=True).over("Region").alias("rank")
        )
        .filter(pl.col("rank") == 1)
        .select(["Region", "Total Capacity (MW)"])
    )

    # Spread the labels symmetrically around their true values: regions are
    # ordered by their last value and each step is 3% of the end-value range.
    n = last_points.height
    spread_fraction = 0.03
    value_range = last_points["Total Capacity (MW)"].max() - last_points["Total Capacity (MW)"].min()

    offsets = (
        last_points
        .sort("Total Capacity (MW)")
        .with_row_index("i")  # with_row_count is deprecated in polars
        .with_columns(
            ((pl.col("i") - (n - 1) / 2) * (value_range * spread_fraction)).alias("data_offset")
        )
        .select(["Region", "data_offset"])
    )

    # Merge offsets and constrain minimum label height: a label is never
    # pushed below 35% of the value it annotates.
    df_regions = (
        df_regions
        .join(offsets, on="Region", how="left")
        .with_columns([
            (pl.col("Total Capacity (MW)") + pl.col("data_offset")).alias("raw_label_y")
        ])
        .with_columns([
            pl.when(pl.col("raw_label_y") < pl.col("Total Capacity (MW)") * 0.35)
            .then(pl.col("Total Capacity (MW)") * 0.35)
            .otherwise(pl.col("raw_label_y"))
            .alias("label_y")
        ])
    )

    pdf = df_regions.to_pandas()

    # Base chart
    base = alt.Chart(pdf).encode(
        x=alt.X("Date:T", title="Year", axis=alt.Axis(format="%Y", tickCount="year")),
        y=alt.Y("Total Capacity (MW):Q", title="Total Capacity (MW)"),
        color=alt.Color("Region:N",
                        scale=alt.Scale(domain=domain_list,
                                        range=color_range), legend=None)
    )

    lines = base.mark_line()

    # Latest row of every region. Grouping by Region keeps this selection in
    # line with last_points above, so a region whose series ends earlier than
    # the others still gets its endpoint marker and label.
    last_rows = (
        base.transform_window(rank='rank()',
                              sort=[alt.SortField('Date', order='descending')],
                              groupby=['Region'])
        .transform_filter('datum.rank == 1')
    )

    points = last_rows.mark_point(filled=True, size=55)
    labels = (
        last_rows
        .mark_text(align="left", dx=5, fontSize=13)
        .encode(
            x="Date:T",
            y="label_y:Q",
            text="Region:N",
            color=alt.value("black")
        )
    )

    return (
        (lines + points + labels)
        .properties(width=750, height=420,
                    title="Regional Demand for Chinese Solar Technology")
        .configure_axis(grid=False)
        .configure_title(
            fontSize=20,
            font="Helvetica",
            anchor="middle",
        )
    )


# Export the regional line chart as SVG, then build it again so it renders as the cell output.
plot_regional_exports().save("article-images/exports_by_region_line.svg")
plot_regional_exports()

  .with_row_count("i")
  .with_row_count("i")


## Bar Chart - Top 15 Importing Countries

In [192]:
def china_solar_exports_countries_df():
    '''
    Select the country/economy-level rows of the cleaned exports table.

    Returns:
        Rows whose "Area type" equals "Country or economy".
    '''
    is_country = pl.col("Area type") == "Country or economy"
    cleaned = china_solar_exports_df()
    return cleaned.filter(is_country)

def china_solar_exports_countries_totals():
    '''
    Sum export capacity over all months for each country and technology type.

    Returns:
        One row per (Area, Commodity type) with "Total Capacity (MW)".
    '''
    capacity_total = pl.col("Capacity (MW)").sum().alias("Total Capacity (MW)")
    by_country_and_tech = china_solar_exports_countries_df().group_by(["Area", "Commodity type"])
    return by_country_and_tech.agg(capacity_total)

def top15():
    '''
    Find the fifteen countries with the largest total imported capacity.

    Returns:
        Single-column table ("Area") with the top 15 country names,
        ordered from largest to smallest total.
    '''
    # Add up all commodity types so each country has one overall figure
    per_country = (
        china_solar_exports_countries_totals()
        .group_by("Area")
        .agg(pl.col("Total Capacity (MW)").sum().alias("Country Total"))
    )
    ranked = per_country.sort("Country Total", descending=True)
    return ranked.head(15).select("Area")

def df_top15():
    '''
    Build the chart dataset: per-technology totals for the top fifteen importers.

    Returns:
        pandas DataFrame with the commodity breakdown of the top 15 countries.
    '''
    totals = china_solar_exports_countries_totals()
    leaders = top15()
    # The inner join drops every country that is not among the leaders
    return totals.join(leaders, on="Area", how="inner").to_pandas()

def plot_bars_top15_importers():
    '''
    Build the stacked bar chart of the top 15 importers of Chinese solar
    technology, with bars split by commodity type.

    Returns:
        Altair stacked bar chart of the leading importing countries.
    '''
    tech_order = ["Panels", "Cells", "Wafers"]
    tech_colors = ['#FBA414', '#12AA9C', '#5A1216']

    # Countries are ordered along x by descending capacity ("-y")
    x_country = alt.X("Area:N", sort="-y", title=None)
    y_capacity = alt.Y("Total Capacity (MW):Q", title="Total Capacity (MW)")
    tech_color = alt.Color(
        "Commodity type:N",
        title="Technology",
        scale=alt.Scale(domain=tech_order, range=tech_colors),
    )
    hover = [
        alt.Tooltip("Area:N", title="Country"),
        alt.Tooltip("Commodity type:N", title="Technology"),
        alt.Tooltip("Total Capacity (MW):Q", title="Capacity (MW)", format=","),
    ]

    bars = alt.Chart(df_top15()).mark_bar().encode(
        x=x_country,
        y=y_capacity,
        color=tech_color,
        tooltip=hover,
    )
    sized = bars.properties(width=700, height=400, title="Who Buys China’s Solar Technology?")
    return sized.configure_title(fontSize=17, font="Helvetica", anchor="middle")

# Export the top-15 bar chart as SVG, then build it again so it renders as the cell output.
plot_bars_top15_importers().save("article-images/bar_topimporters.svg")
plot_bars_top15_importers()

## Lollipop Investments

In [None]:
def cofi_df():
    '''
    Extract the solar projects from the COFI dataset.

    Processing:
        - Keeps rows whose lowercased primary fuel is "solar pv" or "solar csp".
        - Returns the primary fuel in lowercase.
        - Casts the total investment amount to Float64.

    Returns:
        Solar project rows restricted to the columns used by the analysis.
    '''
    solar_fuels = ["solar csp", "solar pv"]
    fuel_lower = pl.col("primary_fuel").str.to_lowercase()
    investment = pl.col("total_investment_amount").cast(pl.Float64).alias("total_investment_amount")

    solar_rows = df_cofi.filter(fuel_lower.is_in(solar_fuels))
    return solar_rows.select([
        "country",
        "country_iso3c",
        "region",
        "location_id.x",
        "installed_capacity",
        investment,
        fuel_lower.alias("primary_fuel"),
    ])


In [None]:
def cofi_plant_level():
    '''
    Reduce the solar project rows to one investment figure per plant.

    A plant can occur on several rows (for example one per financing round).
    Rows are grouped by country, ISO code, region and plant identifier, and
    the largest investment amount in each group is kept to avoid double counting.

    Returns:
        One row per plant with "plant_total_investment".
    '''
    plant_keys = ["country", "country_iso3c", "region", "location_id.x"]
    largest_amount = pl.col("total_investment_amount").max().alias("plant_total_investment")
    return cofi_df().group_by(plant_keys).agg(largest_amount)


def cofi_country_totals():
    '''
    Add up the plant-level investments of each country.

    Returns:
        One row per country with "total_investment_musd", largest first.
    '''
    country_keys = ["country", "country_iso3c", "region"]
    invested = pl.col("plant_total_investment").sum().alias("total_investment_musd")

    per_country = cofi_plant_level().group_by(country_keys).agg(invested)
    return per_country.sort("total_investment_musd", descending=True)


def cofi_country_top_n(n=15):
    '''
    Pick the n countries that received the most Chinese solar investment.

    Parameters:
    n : int, default = 15
        How many countries to keep.

    Returns:
        pandas DataFrame with the country-level totals of the top *n* recipients.
    '''
    # cofi_country_totals() is already sorted from largest to smallest
    ranked = cofi_country_totals()
    leaders = ranked.head(n)
    return leaders.to_pandas()


def cofi_plant_details():
    '''
    Combine plant-level investment with capacity and fuel attributes.

    Used for scatter/bubble map visualizations.

    Returns:
        pandas DataFrame with one attribute row attached to each plant.
    '''
    attribute_columns = ["location_id.x", "installed_capacity", "primary_fuel"]

    # One attribute row per plant identifier, so the join cannot multiply rows
    attributes = cofi_df().select(attribute_columns).unique(
        subset=["location_id.x"], keep="first"
    )

    enriched = cofi_plant_level().join(attributes, on="location_id.x", how="left")
    return enriched.to_pandas()


def plot_top_investment_countries(n=15):
    '''
    Build the lollipop chart of the top *n* countries receiving
    Chinese overseas solar investment.

    Parameters:
    n : int, default = 15
        How many countries to show.

    Returns:
        Layered Altair chart (rule + circle per country); the circle color
        encodes the region.
    '''
    region_names = ["Africa", "Asia", "Europe","Americas"]
    region_colors = [
        "#B55A2A",   # Africa
        "#E8B200",   # Asia
        "#006796",   # Europe
        "#FF8F00",   # Americas
    ]

    top_countries = cofi_country_top_n(n)

    region_color = alt.Color(
        "region:N",
        scale=alt.Scale(domain=region_names, range=region_colors),
        legend=alt.Legend(title="Region")
    )
    hover = [
        'country:N',
        'region:N',
        alt.Tooltip('total_investment_musd:Q', format=',.1f', title='Investment (M USD)')
    ]

    # Both layers share the country axis, ordered by descending investment
    ranked = alt.Chart(top_countries).encode(
        y=alt.Y('country:N', sort='-x', title=None)
    )
    stems = ranked.mark_rule().encode(
        x=alt.X('total_investment_musd:Q', title='Total Investment (USD Millions)')
    )
    heads = ranked.mark_circle(size=90).encode(
        x='total_investment_musd:Q',
        color=region_color,
        tooltip=hover
    )

    # 22 pixels of height per country keeps the row spacing constant for any n
    lollipop = (stems + heads).properties(
        width=650,
        height=22 * len(top_countries),
        title="China’s Major Foreign Solar Investment Destinations"
    )
    return lollipop.configure_view(stroke=None).configure_title(
        fontSize=17,
        font="Helvetica",
        anchor="middle",
    )

In [190]:
# Export the lollipop chart (default top 15) as SVG, then build it again so it renders as the cell output.
plot_top_investment_countries().save("article-images/china_investments_lollipop.svg")
plot_top_investment_countries()

In [None]:
def cofi_region_totals():
    '''
    Sum investment and installed capacity for each region.

    Works from the plant-level table returned by cofi_plant_details(), which
    is built from deduplicated plant records, so a plant that appears on
    several rows of the raw data is counted once.

    Returns:
        pandas DataFrame with "total_investment_musd" and "total_capacity_mw"
        per region, sorted by investment from largest to smallest.
    '''
    region_sums = [
        pl.col("plant_total_investment").sum().alias("total_investment_musd"),
        pl.col("installed_capacity").sum().alias("total_capacity_mw"),
    ]

    # cofi_plant_details() returns pandas, so it is wrapped back into polars
    plant_table = pl.DataFrame(cofi_plant_details())
    per_region = plant_table.group_by("region").agg(region_sums)
    return per_region.sort("total_investment_musd", descending=True).to_pandas()


def plot_region_scatter():
    '''
    Build the scatter plot of total investment against total installed capacity
    of Chinese-funded solar projects, one circle per world region.

    Each region has its own color; all circles share the same size so that
    larger regions are not over-emphasized.

    Returns:
        Altair region-level scatter plot (investment vs. installed capacity).
    '''
    region_names = ["Africa", "Asia", "Europe", "Americas"]
    region_colors = ["#B55A2A", "#E8B200", "#006796", "#FF8F00"]

    x_investment = alt.X("total_investment_musd:Q", title="Total Investment (M USD)")
    y_capacity = alt.Y("total_capacity_mw:Q", title="Installed Capacity (MW)")
    region_color = alt.Color(
        "region:N",
        scale=alt.Scale(domain=region_names, range=region_colors),
        legend=alt.Legend(title="Region")
    )
    hover = [
        alt.Tooltip("region:N", title="Region"),
        alt.Tooltip("total_investment_musd:Q", title="Total Investment (M USD)", format=","),
        alt.Tooltip("total_capacity_mw:Q", title="Installed Capacity (MW)", format=",")
    ]

    circles = alt.Chart(cofi_region_totals()).mark_circle(size=300).encode(
        x=x_investment,
        y=y_capacity,
        color=region_color,
        tooltip=hover
    )
    sized = circles.properties(
        width=650,
        height=450,
        title="Which Regions Receive the Most Chinese Solar Investment and Capacity?"
    )
    return sized.configure_view(stroke=None).configure_title(
        fontSize=15,
        font="Helvetica",
        anchor="middle",
    )


In [185]:
# Build the regional scatter plot once for export.
chart = plot_region_scatter()

# Unlike the other figures, this one is written as a standalone HTML page;
# "actions": False is passed through as an embed option for the rendered chart.
chart.save(
    "article-images/scatter_invest_regions.html",
    format="html",
    embed_options={
        "actions": False
    }
)

# Build the chart a second time so it renders as the cell output.
plot_region_scatter()