# CDP Data 2022: Variable Extraction

## Overview  
This module extracts specific performance variables from the cleaned CDP dataset to support the greenwashing risk assessment framework. The extraction focuses on self-reported emission data and target information from standardized CDP disclosure sections.

## Key Variables Extracted
- **Reporting periods** (C0.2): Temporal alignment for data consistency
- **Target types** (C4.1): Absolute vs. intensity target classification  
- **Scope 1 & 2 emissions** (C6.1, C6.3): Core emission data for intensity calculations
- **Emission intensities** (C6.10): Revenue-based emission metrics when available
- **Target details** (C4.1a, C4.1b): Science-based targets and reduction commitments

## Final Usage Note
While this module extracts comprehensive emission and target data, the final analysis uses **only Scope 1 and Scope 2 emissions per revenue** for standardized comparison. Other extracted variables (Scope 3, absolute targets, detailed intensity metrics) were initially considered but excluded due to inconsistent calculation methodologies across companies.

## Output
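Structured dataframes containing the inputs for standardized emission intensity metrics (Scope 1+2 emissions and revenue in M$) and target information, exported as three Excel workbooks in `data/CDP/` (`CDP_Final_df_notargets_2023.xlsx`, `CDP_Abs_Targets_2023.xlsx`, `CDP_Intensity_Targets_2023.xlsx`) for integration with the performance assessment framework.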

In [None]:
import pandas as pd

In [None]:
# Read every worksheet of the CDP workbook at once; with sheet_name=None
# pandas returns a dict that maps each sheet name to its DataFrame.
CDP = pd.read_excel(
    "data/CDP/CDP_2023_short_color_relevant_rows.xlsx",
    sheet_name=None,
)

CDP

In [None]:
# Sheet C0.2 supplies the reporting period of each organization.
c02_df = CDP["C0.2"]

# Positional selection (0 = Organization, 3 = start date, 4 = end date).
# The result seeds cdp_base_df, the table onto which the following cells
# left-merge one extracted variable after another. No empty placeholder
# frame is created first because it would be overwritten right here.
cdp_base_df = c02_df.iloc[:, [0, 3, 4]].copy()

# Replace the verbose CDP headers with short, readable names
cdp_base_df.columns = ["Organization", "Reporting Start Date", "Reporting End Date"]


In [None]:
# Sheet C4.1 records which kind(s) of emission target an organization has.
c41_df = CDP["C4.1"]

# Positional selection: 0 = Organization, 3 = Target Type
c41_targets = c41_df.iloc[:, [0, 3]].copy()
c41_targets.columns = ["Organization", "Target Type"]

# One Yes/No flag per target kind, based on a case-insensitive substring
# match on the answer text; a missing answer counts as "No".
yes_no = {True: "Yes", False: "No"}
target_text = c41_targets["Target Type"]
c41_targets["Absolute target"] = target_text.str.contains("Absolute", case=False, na=False).map(yes_no)
c41_targets["Intensity target"] = target_text.str.contains("Intensity", case=False, na=False).map(yes_no)

# Drop the raw answer text and attach the two flags to the base dataframe
c41_targets = c41_targets.loc[:, ["Organization", "Absolute target", "Intensity target"]]
cdp_base_df = cdp_base_df.merge(c41_targets, how="left", on="Organization")




In [None]:
# Sheet C6.1 carries the Scope 1 emissions.
c61_df = CDP["C6.1"]

# Positional selection: 0 = Organization, 2 = RowName, 3 = Scope 1 emissions
scope1 = c61_df.iloc[:, [0, 2, 3]].copy()
scope1.columns = ["Organization", "RowName", "Scope 1 Emissions"]

# Only rows whose RowName mentions "reporting year" (case-insensitive) are
# kept; rows with a missing RowName are dropped as well.
is_reporting_year = scope1["RowName"].str.lower().str.contains("reporting year", na=False)

# Build the two-column result directly under its final column name;
# entries that cannot be parsed as numbers become NaN.
scope1_reporting = pd.DataFrame({
    "Organization": scope1.loc[is_reporting_year, "Organization"],
    "Gross Global Sc1 (ton CO2e)": pd.to_numeric(
        scope1.loc[is_reporting_year, "Scope 1 Emissions"], errors="coerce"
    ),
})

# Attach to the base dataframe
cdp_base_df = cdp_base_df.merge(scope1_reporting, how="left", on="Organization")



In [None]:
# Sheet C7.1a lists emissions per greenhouse gas (column 3 names the gas).
c71a_df = CDP["C7.1a"]

# Normalise the gas label (string, trimmed, upper-case) and keep CO2 rows only
gas_label = c71a_df.iloc[:, 3].astype(str).str.strip().str.upper()
co2_rows = c71a_df[gas_label.eq("CO2")]

# Positional selection: 0 = Organization, 4 = Scope 1 CO2 emissions
co2_emissions = co2_rows.iloc[:, [0, 4]].copy()
co2_emissions.columns = ["Organization", "Sc1 CO2 (ton CO2)"]

# Attach to the main dataframe
cdp_base_df = cdp_base_df.merge(co2_emissions, how="left", on="Organization")


In [None]:
# Sheet C-EU7.1b: electric-utility-specific Scope 1 emissions, possibly
# spread over several rows per organization.
ceu71b_df = CDP["C-EU7.1b"]

# Positional selection: 0 = Organization, 3 = emissions
elut_sc1 = ceu71b_df.iloc[:, [0, 3]].copy()
elut_sc1.columns = ["Organization", "Sc1 El. Ut.-specific (ton CO2)"]

# Coerce to numbers before aggregating, as the other emission cells do.
# Without this, text placeholders in the column would be concatenated or
# make the sum fail instead of being treated as missing.
elut_sc1["Sc1 El. Ut.-specific (ton CO2)"] = pd.to_numeric(
    elut_sc1["Sc1 El. Ut.-specific (ton CO2)"], errors="coerce"
)

# Sum per organization. min_count=1 keeps organizations without a single
# valid figure as NaN rather than reporting them as 0 tons emitted.
elut_summary = elut_sc1.groupby("Organization", as_index=False).sum(min_count=1)

# Merge into the main dataframe
cdp_base_df = cdp_base_df.merge(elut_summary, on="Organization", how="left")



In [None]:
# Sheet C6.3 carries the Scope 2 emissions.
c63_df = CDP["C6.3"]

# Positional selection: 0 = Organization, 2 = RowName, 3 = Scope 2 emissions
scope2 = c63_df.iloc[:, [0, 2, 3]].copy()
scope2.columns = ["Organization", "RowName", "Scope 2 Emissions"]

# Rows whose RowName mentions "reporting year" (case-insensitive) are kept;
# every other row, including those without a RowName, is dropped.
in_reporting_year = scope2["RowName"].str.lower().str.contains("reporting year", na=False)

# Coerce to numbers (unparseable entries become NaN) and give the column its
# final name in one chained expression.
scope2_reporting = (
    scope2.loc[in_reporting_year, ["Organization", "Scope 2 Emissions"]]
    .assign(**{
        "Scope 2 Emissions": lambda frame: pd.to_numeric(frame["Scope 2 Emissions"], errors="coerce")
    })
    .rename(columns={"Scope 2 Emissions": "Gross Global Sc2 (ton CO2e)"})
)

# Attach to the base dataframe
cdp_base_df = cdp_base_df.merge(scope2_reporting, how="left", on="Organization")



In [None]:
# Sheet C6.5 carries the Scope 3 emissions, one row per reported entry.
c65_df = CDP["C6.5"]

# Positional selection: 0 = Organization, 4 = Scope 3 emissions
scope3 = c65_df.iloc[:, [0, 4]].copy()
scope3.columns = ["Organization", "Scope 3 Emissions"]

# Drop the "Question not applicable" placeholder rows. The column is cast to
# str for the test so the .str accessor also works when pandas parsed the
# column as purely numeric.
not_applicable = scope3["Scope 3 Emissions"].astype(str).str.contains("Question not applicable", na=False)

# Work on an explicit copy and assign the whole column: this yields a real
# numeric dtype and avoids writing into a filtered slice via .loc, which can
# leave the column as object dtype and raise SettingWithCopyWarning.
scope3_valid = scope3[~not_applicable].copy()
scope3_valid["Scope 3 Emissions"] = pd.to_numeric(scope3_valid["Scope 3 Emissions"], errors="coerce")

# Sum per organization. min_count=1 keeps organizations without a single
# valid figure as NaN rather than reporting them as 0 tons emitted.
scope3_summary = scope3_valid.groupby("Organization", as_index=False).sum(min_count=1)

# Final column name
scope3_summary = scope3_summary.rename(columns={"Scope 3 Emissions": "Gross Global Sc3 (ton CO2e)"})

# Merge the Scope 3 summary into the main dataframe
cdp_base_df = cdp_base_df.merge(scope3_summary, on="Organization", how="left")

# Remove scientific notation for float display
pd.options.display.float_format = '{:,.0f}'.format




In [None]:
# Sheet C6.10 holds the reported emission-intensity metrics.
c610_df = CDP["C6.10"]

# Keep only metrics whose denominator is 'unit total revenue'
# (compared after trimming whitespace and lower-casing).
denominator_kind = c610_df["C6.10_C3 - Metric denominator"].str.strip().str.lower()
c610_filtered = c610_df[denominator_kind == "unit total revenue"]

# The denominator total of those metrics is the company's revenue.
# NOTE: despite its name, scope12_numerator holds that revenue (the metric
# denominator), not an emission numerator; the name is kept so that any other
# cell referring to it keeps working.
scope12_numerator = c610_filtered[["Organization", "C6.10_C4 - Metric denominator: Unit total"]].copy()
scope12_numerator = scope12_numerator.rename(columns={
    "C6.10_C4 - Metric denominator: Unit total": "Revenue total"
})

# Revenue is multiplied by an exchange rate later on, so make sure it is
# numeric here; entries that cannot be parsed become NaN.
scope12_numerator["Revenue total"] = pd.to_numeric(scope12_numerator["Revenue total"], errors="coerce")

# NOTE(review): an organization with several 'unit total revenue' rows will
# appear several times in cdp_base_df after this merge; confirm this is wanted.
cdp_base_df = cdp_base_df.merge(scope12_numerator, on="Organization", how="left")

In [None]:
# Sheet C0.4 states the currency used for all financial figures.
c04_df = CDP["C0.4"]

# Full CDP header of the currency question, renamed to "Currency" below
currency_question = "C0.4_Select the currency used for all financial information disclosed throughout your response."

# Keep the organization and its currency under a short column name
currency_df = c04_df.loc[:, ["Organization", currency_question]].rename(
    columns={currency_question: "Currency"}
)

# Attach the currency to the base dataframe
cdp_base_df = cdp_base_df.merge(currency_df, how="left", on="Organization")

In [None]:
# Exchange rates: value of one unit of each currency expressed in USD
conversion_rates = dict(
    USD=1.0,
    NOK=0.104368,
    CZK=0.042870,
    EUR=1.053049,
    PLN=0.224940,
    CHF=1.047775,
    DKK=0.141547,
)

# Combined Scope 1 + Scope 2 emissions; the result is NaN whenever either
# scope is missing for a row.
scope1_values = cdp_base_df["Gross Global Sc1 (ton CO2e)"]
scope2_values = cdp_base_df["Gross Global Sc2 (ton CO2e)"]
cdp_base_df["Scope 1+2"] = scope1_values.add(scope2_values)

# Calculate Revenue (M$) in USD
def convert_revenue(row, rates=None):
    """Convert one row's reported revenue to millions of US dollars.

    Args:
        row: Mapping-like row (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) providing the keys
            "Revenue total" and "Currency".
        rates: Optional mapping from currency code to the USD value of one
            unit of that currency. Defaults to the module-level
            ``conversion_rates`` table.

    Returns:
        The revenue in millions of USD, or None when the revenue is missing
        or not numeric, or when no exchange rate is known for the currency.
    """
    import warnings

    if rates is None:
        rates = conversion_rates

    revenue = row["Revenue total"]
    if pd.isna(revenue):
        return None
    try:
        revenue = float(revenue)
    except (TypeError, ValueError):
        # Text that is not a number cannot be converted
        return None

    currency = row["Currency"]
    if isinstance(currency, str):
        currency = currency.strip()

    rate = rates.get(currency)
    if rate is None:
        # Falling back to a rate of 1.0 would silently treat the figure as
        # USD and distort every revenue-based intensity for this company.
        warnings.warn(
            f"No USD conversion rate for currency {currency!r}; revenue left empty",
            stacklevel=2,
        )
        return None

    return (revenue * rate) / 1_000_000

# Convert every row's revenue to millions of USD
revenue_in_musd = cdp_base_df.apply(convert_revenue, axis=1)
cdp_base_df["Revenue (M$)"] = revenue_in_musd

# Display floats with thousands separators and four decimals
pd.set_option("display.float_format", '{:,.4f}'.format)
cdp_base_df

In [None]:
# Organizations included in the analysis (names must match the CDP
# "Organization" column exactly)
companies_to_keep = [
    "AKENERJİ ELEKTRİK ÜRETİM A.Ş.",
    "Arendals Fossekompani ASA",
    "Atlantica Sustainable Infrastructure PLC",
    "CEZ",
    "EDF",
    "EDP - Energias de Portugal S.A.",
    "Endesa",
    "ERG S.p.A",
    "Ørsted",
    "Polska Grupa Energetyczna (PGE) SA",
    "Romande Energie Holding SA",
    "Scatec ASA",
    "Solaria Energia y Medio Ambiente SA",
    "Terna Energy S.A",
]

# Restrict the base dataframe to those organizations
is_selected = cdp_base_df["Organization"].isin(companies_to_keep)
cdp_base_df = cdp_base_df.loc[is_selected]

cdp_base_df.head()

In [None]:
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
from openpyxl import load_workbook

# Destination workbook for the combined emissions / revenue table
output_path = "data/CDP/CDP_Final_df_notargets_2023.xlsx"

# Write the plain table first, then reopen the file to apply formatting
cdp_base_df.to_excel(output_path, index=False, engine="openpyxl")
wb = load_workbook(output_path)
ws = wb.active  # the workbook holds exactly one sheet

# Size each column to its longest non-empty cell plus 3 characters of padding
for col in ws.columns:
    max_length = max((len(str(cell.value)) for cell in col if cell.value), default=0)
    col_letter = get_column_letter(col[0].column)
    ws.column_dimensions[col_letter].width = max_length + 3

# Light grey background used for every other company
grey_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

# Band the data rows by company: the shade flips whenever the name in
# column A differs from the row above, so consecutive rows of the same
# company share one colour. Row 1 (the header) is skipped.
prev_company = None
use_grey = False
for row_cells in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
    current_company = row_cells[0].value
    if current_company != prev_company:
        use_grey = not use_grey
        prev_company = current_company

    if not use_grey:
        continue
    for cell in row_cells:
        cell.fill = grey_fill

# Persist the formatted workbook over the plain export
wb.save(output_path)

## Targets

### Absolute target

In [None]:
# Load only the 'C4.1a' sheet (absolute targets).
# NOTE: this rebinds CDP from the dict of all sheets loaded at the top of the
# notebook to a single DataFrame, so the earlier extraction cells cannot be
# re-run afterwards without reloading the workbook first.
file_path = "data/CDP/CDP_2023_short_color_relevant_rows.xlsx"
CDP = pd.read_excel(file_path, sheet_name="C4.1a")

# List of companies to keep
companies_to_keep = [
    "AKENERJİ ELEKTRİK ÜRETİM A.Ş.",
    "Arendals Fossekompani ASA",
    "Atlantica Sustainable Infrastructure PLC",
    "CEZ",
    "EDF",
    "EDP - Energias de Portugal S.A.",
    "Endesa",
    "ERG S.p.A",
    "Ørsted",
    "Polska Grupa Energetyczna (PGE) SA",
    "Romande Energie Holding SA",
    "Scatec ASA",
    "Solaria Energia y Medio Ambiente SA",
    "Terna Energy S.A",
]

# Single source of truth: original CDP header -> short name with units.
# The key order is the column order of the extracted table.
abs_target_renames = {
    "C4.1a_C2 - Is this a science-based target?": "Science-based Target",
    "C4.1a_C3 - Target ambition": "Target Ambition",
    "C4.1a_C4 - Year target was set": "Target Year Set",
    "C4.1a_C5 - Target coverage": "Target Coverage",
    "C4.1a_C6 - Scope(s)": "Scope(s)",
    "C4.1a_C9 - Base year": "Base Year",
    "C4.1a_C10 - Base year Scope 1 emissions covered by target (metric tons CO2e)": "Base Year Scope 1 Covered by Target (ton CO2e)",
    "C4.1a_C11 - Base year Scope 2 emissions covered by target (metric tons CO2e)": "Base Year Scope 2 Covered by Target (ton CO2e)",
    "C4.1a_C29 - Base year total Scope 3 emissions covered by target (metric tons CO2e)": "Base Year Scope 3 Covered by Target (ton CO2e)",
    "C4.1a_C30 - Total base year emissions covered by target in all selected Scopes (metric tons CO2e)": "Base Year Total Covered by Target (ton CO2e)",
    "C4.1a_C31 - Base year Scope 1 emissions covered by target as % of total base year emissions in Scope 1": "Base Year Scope 1 Covered (%)",
    "C4.1a_C32 - Base year Scope 2 emissions covered by target as % of total base year emissions in Scope 2": "Base Year Scope 2 Covered (%)",
    "C4.1a_C50 - Base year total Scope 3 emissions covered by target as % of total base year emissions in Scope 3 (in all Scope 3 categories)": "Base Year Scope 3 Covered (%)",
    "C4.1a_C51 - Base year emissions covered by target in all selected Scopes as % of total base year emissions in all selected Scopes": "Base Year Total Covered (%)",
    "C4.1a_C52 - Target year": "Target Year",
    "C4.1a_C53 - Targeted reduction from base year (%)": "Targeted Reduction from Base Year (%)",
    "C4.1a_C54 - Total emissions in target year covered by target in all selected Scopes (metric tons CO2e) [auto-calculated]": "Target Year Total Covered (ton CO2e)",
    "C4.1a_C55 - Scope 1 emissions in reporting year covered by target (metric tons CO2e)": "Scope 1 in Reporting Year Covered (ton CO2e)",
    "C4.1a_C56 - Scope 2 emissions in reporting year covered by target (metric tons CO2e)": "Scope 2 in Reporting Year Covered (ton CO2e)",
    "C4.1a_C74 - Total Scope 3 emissions in reporting year covered by target (metric tons CO2e)": "Scope 3 in Reporting Year Covered (ton CO2e)",
    "C4.1a_C75 - Total emissions in reporting year covered by target in all selected scopes (metric tons CO2e)": "Total in Reporting Year Covered (ton CO2e)",
    "C4.1a_C77 - % of target achieved relative to base year [auto-calculated]": "Target Achieved (%)",
}

# Columns to pull from the sheet: the organization plus every renamed column.
# Deriving the list from the mapping keeps the two from drifting apart.
columns_to_keep = ["Organization", *abs_target_renames]

# Extract the columns and apply the short names
cdp_abs_targets_df = CDP[columns_to_keep].rename(columns=abs_target_renames)

# Keep the selected companies. The explicit copy makes the result an
# independent frame, so the in-place cleaning done in the next cell does not
# operate on a filtered slice (which raises SettingWithCopyWarning).
cdp_abs_targets_df = cdp_abs_targets_df[cdp_abs_targets_df["Organization"].isin(companies_to_keep)].copy()

cdp_abs_targets_df

In [None]:
# Shorten the CDP placeholder 'Question not applicable' to 'Q not app.'.
# The result is rebound rather than replaced in place: cdp_abs_targets_df
# comes from a boolean filter, and mutating such a slice in place followed by
# column assignment raises SettingWithCopyWarning. Rebinding gives the
# assignments below an independent frame to work on.
cdp_abs_targets_df = cdp_abs_targets_df.replace('Question not applicable', 'Q not app.')

# Reduce the 'Science-based Target' answer to a bare 'Yes' or 'No'. 'Yes' is
# tested first; values containing neither word (e.g. missing answers or the
# shortened placeholder) are left untouched.
cdp_abs_targets_df['Science-based Target'] = cdp_abs_targets_df['Science-based Target'].apply(lambda x: 'Yes' if 'Yes' in str(x) else ('No' if 'No' in str(x) else x))

# In 'Target Ambition', keep only the free text of 'Other, please specify: ...'
cdp_abs_targets_df['Target Ambition'] = cdp_abs_targets_df['Target Ambition'].str.replace(r"Other, please specify: (.*)", r"\1", regex=True)

# Remove scientific notation for float display
pd.options.display.float_format = '{:,.0f}'.format

# Display the updated DataFrame
cdp_abs_targets_df.head()


In [None]:
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
from openpyxl import load_workbook

# Destination workbook for the absolute-target table
output_path = "data/CDP/CDP_Abs_Targets_2023.xlsx"

# Write the plain table first, then reopen the file to apply formatting
cdp_abs_targets_df.to_excel(output_path, index=False, engine="openpyxl")
wb = load_workbook(output_path)
ws = wb.active  # the workbook holds exactly one sheet

# Size each column to its longest non-empty cell plus 3 characters of padding
for col in ws.columns:
    max_length = max((len(str(cell.value)) for cell in col if cell.value), default=0)
    col_letter = get_column_letter(col[0].column)
    ws.column_dimensions[col_letter].width = max_length + 3

# Light grey background used for every other company
grey_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

# Band the data rows by company: the shade flips whenever the name in
# column A differs from the row above, so consecutive rows of the same
# company share one colour. Row 1 (the header) is skipped.
prev_company = None
use_grey = False
for row_cells in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
    current_company = row_cells[0].value
    if current_company != prev_company:
        use_grey = not use_grey
        prev_company = current_company

    if not use_grey:
        continue
    for cell in row_cells:
        cell.fill = grey_fill

# Persist the formatted workbook over the plain export
wb.save(output_path)


### Intensity target

In [None]:
# Load only the 'C4.1b' sheet (intensity targets).
# NOTE: this rebinds CDP to a single DataFrame; cells that expect CDP to be
# the dict of all sheets cannot be re-run afterwards without reloading it.
file_path = "data/CDP/CDP_2023_short_color_relevant_rows.xlsx"
CDP = pd.read_excel(file_path, sheet_name="C4.1b")

# List of companies to keep
companies_to_keep = [
    "AKENERJİ ELEKTRİK ÜRETİM A.Ş.",
    "Arendals Fossekompani ASA",
    "Atlantica Sustainable Infrastructure PLC",
    "CEZ",
    "EDF",
    "EDP - Energias de Portugal S.A.",
    "Endesa",
    "ERG S.p.A",
    "Ørsted",
    "Polska Grupa Energetyczna (PGE) SA",
    "Romande Energie Holding SA",
    "Scatec ASA",
    "Solaria Energia y Medio Ambiente SA",
    "Terna Energy S.A",
]

# Single source of truth: original CDP header -> short name with units.
# The key order is the column order of the extracted table.
intensity_target_renames = {
    "C4.1b_C2 - Is this a science-based target?": "Science-based Target",
    "C4.1b_C3 - Target ambition": "Target Ambition",
    "C4.1b_C4 - Year target was set": "Target Year Set",
    "C4.1b_C5 - Target coverage": "Target Coverage",
    "C4.1b_C6 - Scope(s)": "Scope(s)",
    "C4.1b_C9 - Intensity metric": "Intensity Metric",
    "C4.1b_C10 - Base year": "Base Year",
    "C4.1b_C11 - Intensity figure in base year for Scope 1 (metric tons CO2e per unit of activity)": "Base Year Scope 1 Intensity (ton CO2e/unit)",
    "C4.1b_C12 - Intensity figure in base year for Scope 2 (metric tons CO2e per unit of activity)": "Base Year Scope 2 Intensity (ton CO2e/unit)",
    "C4.1b_C30 - Intensity figure in base year for total Scope 3 (metric tons CO2e per unit of activity)": "Base Year Scope 3 Intensity (ton CO2e/unit)",
    "C4.1b_C31 - Intensity figure in base year for all selected Scopes (metric tons CO2e per unit of activity)": "Base Year Total Intensity (ton CO2e/unit)",
    "C4.1b_C32 - % of total base year emissions in Scope 1 covered by this Scope 1 intensity figure": "Base Year Scope 1 Intensity Coverage (%)",
    "C4.1b_C33 - % of total base year emissions in Scope 2 covered by this Scope 2 intensity figure": "Base Year Scope 2 Intensity Coverage (%)",
    "C4.1b_C51 - % of total base year emissions in Scope 3 (in all Scope 3 categories) covered by this total Scope 3 intensity figure": "Base Year Scope 3 Intensity Coverage (%)",
    "C4.1b_C52 - % of total base year emissions in all selected Scopes covered by this intensity figure": "Base Year Total Intensity Coverage (%)",
    "C4.1b_C53 - Target year": "Target Year",
    "C4.1b_C54 - Targeted reduction from base year (%)": "Targeted Reduction from Base Year (%)",
    "C4.1b_C55 - Intensity figure in target year for all selected Scopes (metric tons CO2e per unit of activity) [auto-calculated]": "Target Year Total Intensity (ton CO2e/unit)",
    "C4.1b_C56 - % change anticipated in absolute Scope 1+2 emissions": "Scope 1+2 Change Anticipated (%)",
    "C4.1b_C57 - % change anticipated in absolute Scope 3 emissions": "Scope 3 Change Anticipated (%)",
    "C4.1b_C58 - Intensity figure in reporting year for Scope 1 (metric tons CO2e per unit of activity)": "Reporting Year Scope 1 Intensity (ton CO2e/unit)",
    "C4.1b_C59 - Intensity figure in reporting year for Scope 2 (metric tons CO2e per unit of activity)": "Reporting Year Scope 2 Intensity (ton CO2e/unit)",
    "C4.1b_C77 - Intensity figure in reporting year for total Scope 3 (metric tons CO2e per unit of activity)": "Reporting Year Scope 3 Intensity (ton CO2e/unit)",
    "C4.1b_C78 - Intensity figure in reporting year for all selected Scopes (metric tons CO2e per unit of activity)": "Reporting Year Total Intensity (ton CO2e/unit)",
    "C4.1b_C80 - % of target achieved relative to base year [auto-calculated]": "Target Achieved (%)",
}

# Columns to pull from the sheet: the organization plus every renamed column.
# Deriving the list from the mapping keeps the two from drifting apart.
columns_to_keep = ["Organization", *intensity_target_renames]

# Extract the columns and apply the short names
cdp_intensity_targets_df = CDP[columns_to_keep].rename(columns=intensity_target_renames)

# Keep the selected companies. The explicit copy makes the result an
# independent frame, so the in-place cleaning done in the next cell does not
# operate on a filtered slice (which raises SettingWithCopyWarning).
cdp_intensity_targets_df = cdp_intensity_targets_df[cdp_intensity_targets_df["Organization"].isin(companies_to_keep)].copy()

cdp_intensity_targets_df.head()


In [None]:
# Shorten the CDP placeholder 'Question not applicable' to 'Q not app.'.
# The result is rebound rather than replaced in place: the frame comes from a
# boolean filter, and mutating such a slice in place followed by column
# assignment raises SettingWithCopyWarning.
cdp_intensity_targets_df = cdp_intensity_targets_df.replace('Question not applicable', 'Q not app.')

# Reduce the 'Science-based Target' answer to a bare 'Yes' or 'No'. 'Yes' is
# tested first; values containing neither word are left untouched.
cdp_intensity_targets_df['Science-based Target'] = cdp_intensity_targets_df['Science-based Target'].apply(lambda x: 'Yes' if 'Yes' in str(x) else ('No' if 'No' in str(x) else x))

# In 'Intensity Metric', keep only the free text of 'Other, please specify: ...'
cdp_intensity_targets_df['Intensity Metric'] = cdp_intensity_targets_df['Intensity Metric'].str.replace(r"Other, please specify: (.*)", r"\1", regex=True)

# Same clean-up for 'Target Ambition'. This must act on the intensity table:
# the previous version re-cleaned cdp_abs_targets_df here, which left this
# table's 'Target Ambition' column untouched.
cdp_intensity_targets_df['Target Ambition'] = cdp_intensity_targets_df['Target Ambition'].str.replace(r"Other, please specify: (.*)", r"\1", regex=True)

# Remove scientific notation for float display
pd.options.display.float_format = '{:,.0f}'.format

# Display the updated DataFrame
cdp_intensity_targets_df.head()



In [None]:
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
from openpyxl import load_workbook

# Destination workbook for the intensity-target table
output_path = "data/CDP/CDP_Intensity_Targets_2023.xlsx"

# Write the plain table first, then reopen the file to apply formatting
cdp_intensity_targets_df.to_excel(output_path, index=False, engine="openpyxl")
wb = load_workbook(output_path)
ws = wb.active  # the workbook holds exactly one sheet

# Size each column to its longest non-empty cell plus 3 characters of padding
for col in ws.columns:
    max_length = max((len(str(cell.value)) for cell in col if cell.value), default=0)
    col_letter = get_column_letter(col[0].column)
    ws.column_dimensions[col_letter].width = max_length + 3

# Light grey background used for every other company
grey_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

# Band the data rows by company: the shade flips whenever the name in
# column A differs from the row above, so consecutive rows of the same
# company share one colour. Row 1 (the header) is skipped.
prev_company = None
use_grey = False
for row_cells in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
    current_company = row_cells[0].value
    if current_company != prev_company:
        use_grey = not use_grey
        prev_company = current_company

    if not use_grey:
        continue
    for cell in row_cells:
        cell.fill = grey_fill

# Persist the formatted workbook over the plain export
wb.save(output_path)
