# Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

# Detecting Columns

In [None]:
import pandas as pd
import numpy as np

def find_header_row(file_path, num_rows_to_check=50):
    """Guess which row of an Excel sheet holds the column headers.

    The first ``num_rows_to_check`` rows are read without a header and each
    row is scored as ``3 * (distinct non-blank cell texts) + (non-numeric
    cells)``. The first row with the highest score wins.

    Blank cells are ignored while scoring: counting them as "non-numeric"
    (and as the literal text ``'nan'``) would inflate the score of sparse
    title rows and empty rows.

    Args:
        file_path: Path (or any object accepted by ``pd.read_excel``).
        num_rows_to_check: Number of leading rows to inspect.

    Returns:
        The 0-indexed position of the detected header row, or 0 when the
        file cannot be read or scored (best-effort fallback).
    """
    try:
        temp_df = pd.read_excel(file_path, header=None, nrows=num_rows_to_check)

        best_header_row_index = 0
        max_score = -1
        for i, (_, row) in enumerate(temp_df.iterrows()):
            # Only cells that actually contain something take part in the score.
            filled_cells = row.dropna()
            non_numeric_count = pd.to_numeric(filled_cells, errors='coerce').isna().sum()
            cell_texts = filled_cells.astype(str)
            unique_non_empty_strings_count = cell_texts[cell_texts.str.strip() != ''].nunique()
            current_score = (unique_non_empty_strings_count * 3) + non_numeric_count
            # Strict '>' keeps the earliest row on ties.
            if current_score > max_score:
                max_score = current_score
                best_header_row_index = i

        print(f"Detected header row (0-indexed): {best_header_row_index}")
        return best_header_row_index

    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the first row.
        print(f"Error occurred during header detection: {e}")
        return 0


# Workbook to analyse.
file_path = 'auditsamplev3.xlsx'

# Locate the header row first, then load the sheet using it as the header.
header_row_index = find_header_row(file_path)
df = pd.read_excel(file_path, header=header_row_index)

# Keep the labels both as a pandas Index and as a plain Python list.
columns = df.columns
columns_list = columns.tolist()

print("\nColumns in the DataFrame:")
print(columns)
print("\nColumns as a Python list:")
print(columns_list)


In [None]:
# Notebook display: preview the first rows of the freshly loaded DataFrame.
df.head()

# Data Cleaning

## 1. Standardize column names: Remove leading/trailing whitespace

In [None]:
# Trim stray whitespace around every column label.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels
print("Columns stripped of whitespace.")
print(f"Current columns after stripping: {stripped_labels.tolist()}")

# The cleaning steps that follow operate on this copy rather than on df.
df_cleaned = df.copy()
print("Created a copy of the DataFrame for cleaning.")

## 2. Handle 'N/A' values

In [None]:
# Placeholder texts that should be treated as missing values.
na_values_to_replace = ['N/A', 'n/a', 'NA', 'N.A.', 'not applicable', '-']
for col in df_cleaned.select_dtypes(include='object').columns:
    column = df_cleaned[col]
    # Stringify only cells that hold a value: a blanket astype(str) would turn
    # real NaN into the literal text 'nan', which is not in the placeholder
    # list and would survive as a bogus category.
    column = column.where(column.isna(), column.astype(str))
    column = column.replace(na_values_to_replace, np.nan)
    # Empty or whitespace-only cells are missing values too.
    df_cleaned[col] = column.replace(r'^\s*$', np.nan, regex=True)

print("Common 'N/A' and empty string values replaced with NaN.")

## 3. Identify and filter 'Dead Links / Redirects'

In [None]:
# Column whose value marks a site as dead or redirected.
traffic_volume_col = 'Level of Traffic Volume'
if traffic_volume_col not in df_cleaned.columns:
    # Nothing to filter on: keep every row.
    print(f"Warning: Column '{traffic_volume_col}' not found. Skipping filtering for active sites.")
    active_sites = df_cleaned.copy()
else:
    inactive_site_indicators = ['N/A - Dead Links / Redirects', 'Dead Links', 'Redirects']
    is_inactive = df_cleaned[traffic_volume_col].isin(inactive_site_indicators)
    active_sites = df_cleaned[~is_inactive].copy()
    print(f"Filtered out inactive sites based on '{traffic_volume_col}'.")
    print(f"Original rows: {len(df_cleaned)}, Active rows: {len(active_sites)}")

## 4. Handle 'Any Score' column

In [None]:
def parse_score_advanced(score_str):
    """Convert one raw score cell into a float, or NaN when it is unusable.

    Recognised inputs (matched case-insensitively after trimming):
      * ``"X out of Y"`` -> ``X / Y * 100`` (NaN when ``Y`` is 0)
      * ``"X%"`` / ``"X %"`` -> ``X``
      * any other text ``float()`` accepts -> that number

    Missing values, blank text, the placeholders listed in the module-level
    ``na_values_to_replace`` and non-finite results ("inf", "infinity",
    "nan") all yield ``np.nan``.

    Args:
        score_str: Raw cell value; it is converted with ``str()`` before parsing.

    Returns:
        A float score or ``np.nan``.
    """
    if pd.isna(score_str):
        return np.nan
    s = str(score_str).strip().lower()
    if not s or s in {token.lower() for token in na_values_to_replace}:
        return np.nan

    # Case 1: "X out of Y" format (e.g., "75 out of 100", "15.5 out of 20").
    # The regex only captures digit groups, so float() cannot fail here.
    match_out_of = re.match(r'(\d+(?:\.\d+)?)\s*out of\s*(\d+(?:\.\d+)?)', s)
    if match_out_of:
        numerator = float(match_out_of.group(1))
        denominator = float(match_out_of.group(2))
        if denominator == 0:
            return np.nan
        return (numerator / denominator) * 100.0

    # Case 2: Percentage format (e.g., "85.5 %", "92%")
    match_percent = re.match(r'(\d+(?:\.\d+)?)\s*%', s)
    if match_percent:
        return float(match_percent.group(1))

    # Case 3: Plain numeric text (e.g., "75", "120.5")
    try:
        value = float(s)
    except ValueError:
        return np.nan
    # float() also accepts "inf"/"infinity"/"nan"; none of those is a usable score.
    return value if np.isfinite(value) else np.nan

# Every column whose label mentions "score" is parsed into floats.
# str() keeps the lookup safe when a label is not a string (e.g. a numeric header).
score_columns = [col for col in active_sites.columns if 'score' in str(col).lower()]

if score_columns:
    print(f"Found score-related columns: {score_columns}")
    for col_name in score_columns:
        # parse_score_advanced stringifies each cell itself and maps missing
        # values to NaN, so no separate astype(str) pass is needed.
        active_sites[col_name] = active_sites[col_name].apply(parse_score_advanced)
        print(f"Cleaned '{col_name}' and converted to float (double).")
        print(f"'{col_name}' Dtype: {active_sites[col_name].dtype}")
else:
    print("No columns containing 'score' were found. Skipping cleaning for such columns.")


print("\n--- Cleaned 'Score' columns preview ---")
print(active_sites[score_columns] if score_columns else active_sites)
print("\nData types after Score column cleaning:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line.
active_sites.info()

## 5. Normalize Columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def normalize_col_name(col_name):
    """Return a lower-case, snake_case-like form of a column label.

    The label is lower-cased and trimmed, ``' / '`` is collapsed to ``'/'``
    and every remaining single space becomes an underscore.
    """
    lowered = col_name.lower().strip()
    compact_slashes = lowered.replace(' / ', '/')
    return '_'.join(compact_slashes.split(' '))


def sanitize_filename(filename):
    """Return *filename* with unsafe characters and whitespace replaced by '_'.

    After trimming, every character that is not a word character, whitespace
    or '-' becomes '_'; each run of whitespace is then collapsed to one '_'.
    """
    without_specials = re.sub(r'[^\w\s-]', '_', filename.strip())
    return re.sub(r'\s+', '_', without_specials)

## 6. Remove Duplicate Websites

In [None]:
# Report the shape before and after de-duplication.
print("Original DataFrame shape:", active_sites.shape)
# NOTE(review): the de-duplication below is currently disabled, so no rows are
# removed and both printed shapes are identical. Confirm whether it should be
# re-enabled (it assumes a 'Website Name / Domain Name' column exists).
# active_sites = active_sites.drop_duplicates(subset=['Website Name / Domain Name'], keep='first')
print("\nDataFrame shape after removing duplicates:", active_sites.shape)


## 7. General categorical column cleaning

In [None]:
# Object-dtype columns are treated as categorical text.
categorical_cols = active_sites.select_dtypes(include=['object']).columns

if len(categorical_cols) > 0:
    print(f"\nFound potential categorical columns for cleaning: {list(categorical_cols)}")
    for col_name in categorical_cols:
        column = active_sites[col_name]
        # Stringify and strip only cells that hold a value: a blanket
        # astype(str) would turn NaN into the literal category 'nan'.
        column = column.where(column.isna(), column.astype(str).str.strip())
        active_sites[col_name] = column.replace(r'^\s*$', np.nan, regex=True)
        print(f"Cleaned '{col_name}' by stripping whitespace and replacing empty strings with NaN.")
else:
    print("\nNo columns with 'object' dtype found to clean as categorical.")


print("\n--- Cleaned DataFrame Info ---")
active_sites.info()
print("\n--- Cleaned DataFrame Head ---")
print(active_sites.head())

## 8. General numerical column cleaning

In [None]:
# Try to promote every column to a numeric dtype.
# NOTE(review): a column is converted as soon as at least one value parses as
# a number; all other entries in that column become NaN. Confirm this is
# intended for mostly-text columns.
for col in active_sites.columns:
    converted_col = pd.to_numeric(active_sites[col], errors='coerce')
    if not converted_col.isnull().all() and converted_col.dtype != active_sites[col].dtype:
        active_sites[col] = converted_col
        print(f"Attempted to convert column '{col}' to numeric. New Dtype: {active_sites[col].dtype}")

print("\n--- Cleaning Summary ---")
print("First 5 rows of the cleaned 'active_sites' DataFrame:")
print(active_sites.head())
print("\nData types of 'active_sites' DataFrame:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line.
active_sites.info()
print(f"\nShape of the cleaned DataFrame (active_sites): {active_sites.shape}")



In [None]:
# Notebook display: preview the first rows of the cleaned DataFrame.
active_sites.head()

In [None]:
# Notebook display: (rows, columns) of the cleaned DataFrame.
active_sites.shape

# Data Visualization

# Chart Generation

# PPT Integration

In [None]:
# !pip install requests
# !pip install python-pptx

### Generating PPT

## Changes

In [None]:
import asyncio
import io
import re
from typing import Any, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.util import Emu, Pt

# --- Utility Functions ---

def normalize_col_name(col_name: str) -> str:
    """Return *col_name* lower-cased with '_', '-' and spaces removed."""
    separators_removed = str.maketrans('', '', '_- ')
    return col_name.lower().translate(separators_removed)

def get_risk_counts(scores: pd.Series) -> Tuple[int, int, int]:
    """Count scores per risk band.

    Bands: high risk is ``score < 60``, medium is ``60 <= score < 90`` and
    low is ``score >= 90``. Missing values fall into no band.

    Returns:
        ``(high, medium, low)`` counts.
    """
    is_high = scores.lt(60)
    is_medium = scores.ge(60) & scores.lt(90)
    is_low = scores.ge(90)
    return is_high.sum(), is_medium.sum(), is_low.sum()

def describe_distribution(series: pd.Series) -> str:
    """Describe the shape of *series* from its sample skewness.

    Skewness above 0.5 is reported as right-skewed and below -0.5 as
    left-skewed; anything else (including an undefined skewness) is
    reported as "normally distributed".
    """
    skewness = series.skew()
    if skewness > 0.5:
        return "right-skewed (most values below average)"
    if skewness < -0.5:
        return "left-skewed (most values above average)"
    return "normally distributed"

def generate_bullet_points_for_chart(
    df: pd.DataFrame, df_original: pd.DataFrame, col: str, chart_type: str
) -> List[str]:
    """Generate contextual bullet points based on data analysis.

    Args:
        df: Frame holding the column being charted.
        df_original: Unfiltered frame; only its row count is used, to report
            how many rows have no valid value for ``col``.
        col: Name of the column in ``df`` to summarise.
        chart_type: One of ``"risk_distribution"``, ``"score_distribution"``,
            ``"numerical_distribution"``, ``"categorical_pie"`` or
            ``"categorical_bar"``.

    Returns:
        At most four bullet strings. The list is empty when ``chart_type`` is
        not recognised or the column has no non-missing values.
    """
    valid_data = df[col].dropna()
    # Nothing to describe; this also protects the categorical branches, which
    # index the first entry of value_counts().
    if valid_data.empty:
        return []

    bullet_points: List[str] = []
    if chart_type == "risk_distribution":
        high, _, low = get_risk_counts(valid_data)
        total = len(valid_data)
        high_pct, low_pct = (high/total)*100, (low/total)*100
        mean_score = valid_data.mean()
        bullet_points = [
            f"Total {total} records analyzed with average score of {mean_score:.1f}",
            f"High risk sites represent {high_pct:.1f}% ({high}) of total records",
            f"Low risk sites account for {low_pct:.1f}% ({low}) of total records",
            f"Immediate attention needed for {high} high-risk sites"
        ]
    elif chart_type == "score_distribution":
        mean, median, std = valid_data.mean(), valid_data.median(), valid_data.std()
        dist_desc = describe_distribution(valid_data)
        min_val, max_val = valid_data.min(), valid_data.max()
        bullet_points = [
            f"Scores range from {min_val:.1f} to {max_val:.1f} with a mean of {mean:.1f}.",
            f"Standard deviation of {std:.1f} shows {'high' if std > mean * 0.3 else 'moderate'} variability.",
            f"The distribution is {dist_desc}.",
            f"Median of {median:.1f} is {'close to' if abs(mean-median)<std*0.1 else 'distinct from'} the mean."
        ]
    elif chart_type == "numerical_distribution":
        mean, median = valid_data.mean(), valid_data.median()
        q25, q75 = valid_data.quantile(0.25), valid_data.quantile(0.75)
        original_total = len(df_original)
        missing_count = original_total - len(valid_data)
        # Guard the percentage against an empty reference frame.
        missing_pct = (missing_count / original_total) * 100 if original_total else 0.0
        bullet_points = [
            f"The dataset has {len(valid_data)} valid entries and {missing_count} dead links (about {missing_pct:.1f}%)",
            f"The average value is {mean:.2f}, with a median of {median:.2f}.",
            f"The 25th percentile is {q25:.2f}, and the 75th percentile is {q75:.2f}.",
            f"The middle 50% of data lies between {q25:.2f} and {q75:.2f}."
        ]
    elif chart_type == "categorical_pie":
        value_counts = df[col].value_counts()
        total_count = len(valid_data)
        top_category, top_count = value_counts.index[0], value_counts.iloc[0]
        top_pct = (top_count/total_count)*100
        bullet_points = [
            f"The dataset contains {len(value_counts)} distinct categories across {total_count} records.",
            f"The dominant category '{top_category}' has {top_count} occurrences ({top_pct:.1f}%).",
            f"The distribution is {'relatively uniform' if top_pct < 40 else 'skewed towards a few dominant categories'}.",
            f"Category representation is {'fairly consistent' if value_counts.std() < value_counts.mean() * 0.5 else 'highly variable'}."
        ]
    elif chart_type == "categorical_bar":
        value_counts = df[col].value_counts()
        total_count = len(valid_data)
        top_10_count = value_counts.head(10).sum()
        top_category, top_pct = value_counts.index[0], (value_counts.iloc[0]/total_count)*100
        bullet_points = [
            f"The column {col} has {len(value_counts)} unique categories across {total_count} records.",
            f"The top category '{top_category}' accounts for {top_pct:.1f}% of the data.",
            f"The top 10 categories make up {(top_10_count/total_count)*100:.1f}% of all entries.",
            f"The categories are {'spread out across many values' if len(value_counts) > total_count * 0.5 else 'mostly focused on a few values'}."
        ]
    return bullet_points[:4]

def generate_hexbin_bullet_points(df: pd.DataFrame, x_col: str, y_col: str) -> List[str]:
    """Build the three bullet points that accompany a hexbin chart.

    The bullets explain the colour scale, state the sign and value of the
    correlation between the two columns, and give each column's range.
    """
    x_values, y_values = df[x_col], df[y_col]
    corr = x_values.corr(y_values)

    if corr > 0:
        trend = 'positive'
    elif corr < 0:
        trend = 'negative'
    else:
        trend = 'no'

    density_note = "Darker hexagons indicate denser data regions, while lighter ones show sparser observations."
    trend_note = f"A {trend} linear trend exists (Pearson correlation coefficient: {corr:.2f})."
    range_note = (
        f"{x_col} ranges from {x_values.min():.1f} to {x_values.max():.1f}, "
        f"and {y_col} ranges from {y_values.min():.1f} to {y_values.max():.1f}."
    )
    return [density_note, trend_note, range_note]

def add_conclusion(df: pd.DataFrame) -> List[str]:
    """Generate summary insights based on risk and volume columns.

    The first bullet always lists the column names. A second bullet is added
    when some rows have a text "volume" column containing "high" together
    with a numeric "score" column below 50; it names up to nine such sites,
    taken from the first column whose label contains "website", "domain",
    "site" or "link".

    Args:
        df: Cleaned frame to summarise.

    Returns:
        One or two bullet strings.
    """
    columns_bullet = (
        f"The current dataset includes the following key columns: {', '.join(map(str, df.columns))}"
    )

    def _label_has(col, *keywords) -> bool:
        # str() keeps the keyword lookup safe for non-string column labels.
        lowered = str(col).lower()
        return any(keyword in lowered for keyword in keywords)

    volume_cols = [col for col in df.columns if _label_has(col, "volume")]
    str_volume_cols = [
        col for col in volume_cols
        if df[col].map(lambda value: isinstance(value, str)).any()
    ]
    # Only numeric score columns can be compared against the threshold.
    score_cols = [
        col for col in df.columns
        if _label_has(col, "score") and pd.api.types.is_numeric_dtype(df[col])
    ]
    name_cols = [col for col in df.columns if _label_has(col, "website", "domain", "site", "link")]

    if not (str_volume_cols and score_cols and name_cols):
        return [columns_bullet]

    # Collect the matches and concatenate once instead of growing a frame in the loop.
    matches = []
    for vol_col in str_volume_cols:
        is_high_volume = df[vol_col].str.contains("high", case=False, na=False)
        for score_col in score_cols:
            matches.append(df.loc[is_high_volume & (df[score_col] < 50), name_cols])
    result = pd.concat(matches, ignore_index=True).drop_duplicates().head(10)
    if result.empty:
        return [columns_bullet]

    first_name_col = result.columns[0]
    # str() so that non-text identifiers cannot break the join below.
    top_sites = [str(site) for site in result[first_name_col].dropna().unique()[:9]]
    if not top_sites:
        return [columns_bullet]

    if len(top_sites) > 1:
        site_list = ', '.join(top_sites[:-1]) + f', and {top_sites[-1]}'
    else:
        site_list = top_sites[0]
    return [
        columns_bullet,
        f"The sites {site_list} are considered high-risk given their low {str(score_cols[0]).lower()} despite having high {str(str_volume_cols[0]).lower()}."
    ]

In [None]:
# --- Visualization Functions ---

def plot_numerical_histogram(ax, series: pd.Series, col: str, clean_title: str):
    """Draw a histogram with KDE on *ax* and mark mean, median and +/- one std dev.

    Args:
        ax: Axes object to draw on.
        series: Values to plot.
        col: Column name, used as the x-axis label.
        clean_title: Display name used in the chart title.
    """
    center = series.mean()
    middle = series.median()
    spread = series.std()

    sns.histplot(series, kde=True, color='cornflowerblue', bins=30, ax=ax)

    # Vertical reference lines; only the upper std-dev line carries the legend entry.
    reference_lines = (
        (center, {'color': 'blue', 'linestyle': '--', 'label': f'Mean: {center:.2f}'}),
        (middle, {'color': 'red', 'linestyle': '--', 'label': f'Median: {middle:.2f}'}),
        (center + spread, {'color': 'purple', 'linestyle': ':', 'label': f'+-1 Std Dev: {spread:.2f}'}),
        (center - spread, {'color': 'purple', 'linestyle': ':'}),
    )
    for position, line_style in reference_lines:
        ax.axvline(position, **line_style)

    ax.set_title(f'Distribution of {clean_title}', fontsize=16, pad=20)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend()

def plot_categorical_bar(ax, value_counts: pd.Series, col: str, clean_title: str):
    """Draw a horizontal bar chart of the ten most frequent values on *ax*.

    Values are ranked by count (descending), then by label length and label
    (ascending). Labels longer than 30 characters are truncated with an ellipsis.

    Args:
        ax: Axes object to draw on.
        value_counts: Counts per category.
        col: Column name, used as the y-axis label.
        clean_title: Display name used in the chart title.
    """
    max_label_length = 30

    def _shorten(label_text):
        if len(label_text) <= max_label_length:
            return label_text
        return label_text[:max_label_length] + "…"

    ranked = value_counts.reset_index()
    ranked.columns = ['label', 'count']
    ranked["label_length"] = ranked["label"].astype(str).map(len)
    ranked = ranked.sort_values(
        by=["count", "label_length", "label"],
        ascending=[False, True, True],
    )

    top_10_df = ranked.head(10).copy()
    top_10_df["short_label"] = top_10_df["label"].astype(str).map(_shorten)

    short_labels = top_10_df["short_label"]
    sns.barplot(
        x=top_10_df["count"],
        y=short_labels,
        hue=short_labels,
        palette="Set2",
        orient="h",
        legend=False,
        ax=ax,
    )
    ax.set_title(f'Top 10 Most Frequent Values in {clean_title}', fontsize=16, pad=20)
    ax.set_xlabel('Count', fontsize=12)
    ax.set_ylabel(col, fontsize=12)

def visualize_column_summary(
    active_sites_df: pd.DataFrame, df_original: pd.DataFrame
) -> Tuple[List[Tuple[Any, str, List[str]]], List[str]]:
    """Build one or more charts per column plus the closing insight bullets.

    Per column (columns whose normalized name is ``id`` are skipped):
      * numeric "score" columns get a risk pie chart and a histogram;
      * other numeric columns get a histogram;
      * non-numeric columns get a pie chart (up to 5 distinct values) or a
        top-10 bar chart; columns that are all-unique or empty are skipped.
    Numeric columns without any non-missing value produce no chart.
    A hexbin chart is appended when both a numeric "traffic" column and a
    numeric "score" column exist.

    Args:
        active_sites_df: Cleaned frame to visualise.
        df_original: Unfiltered frame, forwarded to the bullet generator.

    Returns:
        ``(chart_data, conclusion_points)`` where ``chart_data`` is a list of
        ``(figure, slide_title, bullet_points)`` tuples. ``([], [])`` is
        returned when ``active_sites_df`` is not a DataFrame.
    """
    if not isinstance(active_sites_df, pd.DataFrame):
        print("Error: Input is not a valid pandas DataFrame.")
        return [], []
    sns.set_style("whitegrid")
    cols_to_exclude = [normalize_col_name('id')]
    score_keyword = 'score'
    cols_for_viz = [col for col in active_sites_df.columns if normalize_col_name(col) not in cols_to_exclude]
    chart_data = []
    num_rows = len(active_sites_df)

    for col in cols_for_viz:
        clean_title = col.replace('_', ' ').replace('-', ' ').title()
        if pd.api.types.is_numeric_dtype(active_sites_df[col]):
            if score_keyword in normalize_col_name(col):
                valid_scores = active_sites_df[col].dropna()
                if not valid_scores.empty:
                    # Pie chart: risk distribution
                    fig1, ax1 = plt.subplots(figsize=(10, 6))
                    high, medium, low = get_risk_counts(valid_scores)
                    risk_counts = [high, medium, low]
                    risk_labels = [
                        'High Risk (< 60)', 'Medium Risk (60-89)', 'Low Risk (>= 90)'
                    ]
                    colors = ['#FF6347', '#FFD700', '#90EE90']
                    # Pull out the high-risk wedge only when it exists.
                    explode_values = [0.05 if i == 0 and high > 0 else 0 for i in range(3)]
                    ax1.pie(
                        risk_counts, labels=risk_labels, autopct='%1.1f%%', startangle=140,
                        colors=colors, wedgeprops={'edgecolor': 'white'}, shadow=True, explode=explode_values
                    )
                    ax1.set_title(f'{clean_title} Risk Distribution', fontsize=16, pad=20)
                    ax1.set_ylabel('')
                    fig1.tight_layout()
                    chart_data.append((
                        fig1, f"{clean_title} Risk Distribution",
                        generate_bullet_points_for_chart(active_sites_df, df_original, col, "risk_distribution")
                    ))
                    # Detach the figure from pyplot; the Figure object itself stays in chart_data.
                    plt.close(fig1)

                    # Histogram: score distribution
                    fig2, ax2 = plt.subplots(figsize=(10, 6))
                    plot_numerical_histogram(ax2, valid_scores, col, clean_title)
                    fig2.tight_layout()
                    chart_data.append((
                        fig2, f"{clean_title} Distribution",
                        generate_bullet_points_for_chart(active_sites_df, df_original, col, "score_distribution")
                    ))
                    plt.close(fig2)
            else:
                numeric_values = active_sites_df[col].dropna()
                # Same guard as the score branch: an all-missing column would
                # only yield an empty chart without bullet points.
                if numeric_values.empty:
                    continue
                fig, ax = plt.subplots(figsize=(10, 6))
                plot_numerical_histogram(ax, numeric_values, col, clean_title)
                fig.tight_layout()
                chart_data.append((
                    fig, f"{clean_title} Distribution",
                    generate_bullet_points_for_chart(active_sites_df, df_original, col, "numerical_distribution")
                ))
                plt.close(fig)
        else:
            unique_count = active_sites_df[col].nunique()
            if unique_count == num_rows or unique_count == 0:
                continue  # Skip unique or empty
            if unique_count <= 5:
                fig, ax = plt.subplots(figsize=(10, 6))
                value_counts = active_sites_df[col].value_counts()
                explode_values = [0.05] * len(value_counts)
                ax.pie(
                    value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=140,
                    wedgeprops={'edgecolor': 'white'}, shadow=True, explode=explode_values
                )
                ax.set_title(f'Distribution of {clean_title}', fontsize=16, pad=20)
                ax.set_ylabel('')
                fig.tight_layout()
                chart_data.append((
                    fig, f"{clean_title} Distribution",
                    generate_bullet_points_for_chart(active_sites_df, df_original, col, "categorical_pie")
                ))
                plt.close(fig)
            else:
                fig, ax = plt.subplots(figsize=(10, 6))
                value_counts = active_sites_df[col].value_counts()
                plot_categorical_bar(ax, value_counts, col, clean_title)
                fig.tight_layout()
                chart_data.append((
                    fig, f"Top 10 {clean_title} Values",
                    generate_bullet_points_for_chart(active_sites_df, df_original, col, "categorical_bar")
                ))
                plt.close(fig)

    # Hexbin plot (traffic vs score); the last matching numeric column of each kind wins.
    traffic_col, score_col = None, None
    for col in active_sites_df.columns:
        norm_col = normalize_col_name(col)
        if pd.api.types.is_numeric_dtype(active_sites_df[col]):
            if 'traffic' in norm_col:
                traffic_col = col
            elif 'score' in norm_col:
                score_col = col
    if traffic_col and score_col:
        x, y = active_sites_df[traffic_col].dropna(), active_sites_df[score_col].dropna()
        # Index alignment plus dropna keeps only rows where both values are present.
        df_hex = pd.DataFrame({traffic_col: x, score_col: y}).dropna()
        if not df_hex.empty:
            fig, ax = plt.subplots(figsize=(10, 6))
            hb = ax.hexbin(df_hex[traffic_col], df_hex[score_col], gridsize=30, cmap='viridis_r', mincnt=1)
            cb = fig.colorbar(hb, ax=ax)
            cb.set_label('Count')
            ax.set_xlabel(traffic_col)
            ax.set_ylabel(score_col)
            ax.set_title(f'{traffic_col} vs {score_col} Hexbin Plot', fontsize=16, pad=20)
            fig.tight_layout()
            chart_title = f"{traffic_col.replace('_',' ').title()} vs {score_col.replace('_',' ').title()} Relationship"
            chart_data.append((fig, chart_title, generate_hexbin_bullet_points(df_hex, traffic_col, score_col)))
            plt.close(fig)

    return chart_data, add_conclusion(active_sites_df)


In [None]:
# --- PPTX Creation Functions ---

# Layout Constants
# Fonts used for slide titles and bullet text.
TITLE_FONT_NAME = 'Times New Roman'
TITLE_FONT_COLOR = RGBColor(112, 48, 160)
TITLE_FONT_SIZE = Pt(36)
BULLET_FONT_NAME = 'Calibri'
BULLET_FONT_SIZE = Pt(20)
BULLET_SPACE_AFTER = Pt(12)
# Slide margins, as fractions of the slide height (top/bottom) or width (left/right).
TOP_MARGIN_RATIO = 0.06
BOTTOM_MARGIN_RATIO = 0.08
LEFT_MARGIN_RATIO = 0.06
RIGHT_MARGIN_RATIO = 0.06
# Vertical split of the usable area: title band first, content takes the remainder.
TITLE_HEIGHT_RATIO = 0.20
CONTENT_HEIGHT_RATIO = 1 - TITLE_HEIGHT_RATIO
# Title box width as a fraction of the full slide width.
TITLE_WIDTH_RATIO = 0.70
# Horizontal split of the usable width between the bullet text and the chart image.
TEXT_WIDTH_RATIO = 0.30
IMAGE_WIDTH_RATIO = 0.70
# Gap between the text and image columns, as a fraction of the usable width.
GAP_BETWEEN_TEXT_AND_IMAGE_RATIO = 0.02

def bolden_values_paragraph(p, text):
    """Write *text* into paragraph *p* as runs, bolding value-like tokens.

    Tokens matching the pattern below (numbers with optional '%',
    parenthesised numbers, and dot-separated words) become bold runs; the
    text between them is added as regular runs. All runs use the bullet
    font name and size.
    """
    def _append_run(fragment, is_bold):
        run = p.add_run()
        run.text = fragment
        run.font.bold = is_bold
        run.font.size = BULLET_FONT_SIZE
        run.font.name = BULLET_FONT_NAME

    highlight = re.compile(r"(\d[\d,\.]*%?|\([\d,\.]+\)|\b[\w\-]+(?:\.[\w\-]+)+\b)")
    cursor = 0
    for hit in highlight.finditer(text):
        begin, end = hit.span()
        # Plain text between the previous token and this one.
        if begin > cursor:
            _append_run(text[cursor:begin], False)
        _append_run(hit.group(0), True)
        cursor = end
    # Trailing plain text after the last token.
    if cursor < len(text):
        _append_run(text[cursor:], False)

def add_custom_chart_slide(
    prs: Presentation,
    chart_fig,
    chart_title: str,
    bullet_points: List[str]
):
    """Append a slide with a centred title, bullets on the left and the chart on the right.

    Args:
        prs: Presentation to extend; the slide uses layout index 6.
        chart_fig: Figure to render; it is saved as PNG and then closed.
        chart_title: Text for the slide title.
        bullet_points: Bullet texts placed in the left-hand column.
    """
    # Factor used throughout this function to convert between pixels and EMU.
    emu_per_px = 9525

    full_w, full_h = prs.slide_width, prs.slide_height
    area_w = full_w * (1 - LEFT_MARGIN_RATIO - RIGHT_MARGIN_RATIO)
    area_h = full_h * (1 - TOP_MARGIN_RATIO - BOTTOM_MARGIN_RATIO)
    area_left = full_w * LEFT_MARGIN_RATIO
    area_top = full_h * TOP_MARGIN_RATIO
    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank layout

    # --- Title: horizontally centred on the full slide width ---
    heading_w = full_w * TITLE_WIDTH_RATIO
    heading_h = area_h * TITLE_HEIGHT_RATIO
    heading_box = slide.shapes.add_textbox(
        Emu((full_w - heading_w) / 2), Emu(area_top), Emu(heading_w), Emu(heading_h)
    )
    heading_frame = heading_box.text_frame
    heading_frame.text = chart_title
    heading_frame.word_wrap = True
    heading_para = heading_frame.paragraphs[0]
    heading_para.alignment = PP_ALIGN.CENTER
    heading_font = heading_para.runs[0].font
    heading_font.size = TITLE_FONT_SIZE
    heading_font.bold = True
    heading_font.name = TITLE_FONT_NAME
    heading_font.color.rgb = TITLE_FONT_COLOR

    # --- Content band below the title, split into a text column and an image column ---
    body_top = area_top + area_h * TITLE_HEIGHT_RATIO
    body_h = area_h * CONTENT_HEIGHT_RATIO
    half_gap = area_w * GAP_BETWEEN_TEXT_AND_IMAGE_RATIO / 2

    bullets_box = slide.shapes.add_textbox(
        Emu(area_left),
        Emu(body_top),
        Emu(area_w * TEXT_WIDTH_RATIO - half_gap),
        Emu(body_h),
    )
    bullets_frame = bullets_box.text_frame
    bullets_frame.word_wrap = True
    for position, bullet_text in enumerate(bullet_points):
        # The text frame starts with one empty paragraph; reuse it for the first bullet.
        para = bullets_frame.add_paragraph() if position else bullets_frame.paragraphs[0]
        para.level = 0
        para.space_after = BULLET_SPACE_AFTER
        para.alignment = PP_ALIGN.LEFT
        bolden_values_paragraph(para, u"\u2022 " + bullet_text)

    # --- Image: render the figure to PNG in memory ---
    slot_left = Emu(area_left + area_w * TEXT_WIDTH_RATIO + half_gap)
    slot_top = Emu(body_top)
    slot_w = Emu(area_w * IMAGE_WIDTH_RATIO - half_gap)
    slot_h = Emu(body_h)

    png_buffer = io.BytesIO()
    chart_fig.savefig(png_buffer, format='png', dpi=200, bbox_inches='tight')
    png_buffer.seek(0)
    plt.close(chart_fig)

    rendered = Image.open(png_buffer)
    px_w, px_h = rendered.size
    slot_w_px = int(slot_w / emu_per_px)
    slot_h_px = int(slot_h / emu_per_px)
    aspect = px_h / px_w

    # Fit the picture inside the slot while keeping its aspect ratio.
    if slot_h_px / aspect > slot_w_px:
        # Width is the limiting dimension.
        fit_w_px = slot_w_px
        fit_h_px = int(fit_w_px * aspect)
    else:
        # Height is the limiting dimension.
        fit_h_px = slot_h_px
        fit_w_px = int(fit_h_px / aspect)

    # Centre the fitted picture inside the slot.
    pad_top_px = int((slot_h_px - fit_h_px) / 2)
    pad_left_px = int((slot_w_px - fit_w_px) / 2)
    slide.shapes.add_picture(
        png_buffer,
        slot_left + Emu(pad_left_px * emu_per_px),
        slot_top + Emu(pad_top_px * emu_per_px),
        Emu(fit_w_px * emu_per_px),
        Emu(fit_h_px * emu_per_px)
    )

async def build_presentation_with_charts(
    template_path: str,
    chart_figures_and_titles: List[Tuple[Any, str, List[str]]],
    output_path: str,
    insight_points: Optional[List[str]] = None
) -> None:
    """Create the deck: template cover, chart slides, insights slide, thank-you slide.

    Args:
        template_path: Template presentation; only its first slide is kept.
        chart_figures_and_titles: ``(figure, title, bullets)`` tuples, one chart slide each.
        output_path: Where the finished presentation is saved.
        insight_points: Bullets for the "SUMMARY INSIGHTS" slide; may be None.
    """
    prs = Presentation(template_path)

    # Drop every template slide except the first, walking from the end backwards.
    slide_id_list = prs.slides._sldIdLst
    for idx in reversed(range(1, len(prs.slides))):
        slide_entry = slide_id_list[idx]
        rel_id = slide_entry.rId
        slide_id_list.remove(slide_entry)
        prs.part.drop_rel(rel_id)

    # One slide per chart.
    for fig_obj, chart_title, chart_bullets in chart_figures_and_titles:
        add_custom_chart_slide(prs, fig_obj, chart_title, chart_bullets)

    # Shared geometry for the two closing slides.
    full_w, full_h = prs.slide_width, prs.slide_height
    area_w = full_w * (1 - LEFT_MARGIN_RATIO - RIGHT_MARGIN_RATIO)
    area_h = full_h * (1 - TOP_MARGIN_RATIO - BOTTOM_MARGIN_RATIO)
    area_left = full_w * LEFT_MARGIN_RATIO
    area_top = full_h * TOP_MARGIN_RATIO

    # --- Insights slide ---
    insights_slide = prs.slides.add_slide(prs.slide_layouts[6])
    heading_w = full_w * TITLE_WIDTH_RATIO
    heading_h = area_h * TITLE_HEIGHT_RATIO
    heading_box = insights_slide.shapes.add_textbox(
        Emu((full_w - heading_w) / 2), Emu(area_top), Emu(heading_w), Emu(heading_h)
    )
    heading_frame = heading_box.text_frame
    heading_frame.text = "SUMMARY INSIGHTS"
    heading_para = heading_frame.paragraphs[0]
    heading_para.alignment = PP_ALIGN.CENTER
    heading_font = heading_para.runs[0].font
    heading_font.size = TITLE_FONT_SIZE
    heading_font.bold = True
    heading_font.name = TITLE_FONT_NAME
    heading_font.color.rgb = TITLE_FONT_COLOR

    body_top = area_top + area_h * TITLE_HEIGHT_RATIO
    body_h = area_h * CONTENT_HEIGHT_RATIO
    insights_box = insights_slide.shapes.add_textbox(
        Emu(area_left + area_w * 0.15),
        Emu(body_top),
        Emu(area_w * 0.75),
        Emu(body_h),
    )
    insights_frame = insights_box.text_frame
    insights_frame.word_wrap = True
    for position, insight_text in enumerate(insight_points or []):
        # Reuse the frame's initial paragraph for the first bullet.
        para = insights_frame.add_paragraph() if position else insights_frame.paragraphs[0]
        para.level = 0
        para.space_after = BULLET_SPACE_AFTER
        para.alignment = PP_ALIGN.LEFT
        bolden_values_paragraph(para, u"\u2022 " + insight_text)

    # --- Thank-you slide ---
    closing_slide = prs.slides.add_slide(prs.slide_layouts[6])
    closing_box = closing_slide.shapes.add_textbox(
        Emu(area_left + area_w * 0.15),
        Emu(area_top + area_h * 0.35),
        Emu(area_w * 0.7),
        Emu(area_h * 0.3),
    )
    closing_frame = closing_box.text_frame
    closing_frame.text = "THANK YOU"
    closing_para = closing_frame.paragraphs[0]
    closing_para.alignment = PP_ALIGN.CENTER
    closing_font = closing_para.runs[0].font
    closing_font.size = Pt(54)
    closing_font.bold = True
    closing_font.name = TITLE_FONT_NAME
    closing_font.color.rgb = RGBColor(128, 0, 128)
    closing_frame.word_wrap = True

    prs.save(output_path)
    print(f"Presentation created at '{output_path}'")



In [None]:
# --- Main Execution ---

async def main_execution(
    active_sites: pd.DataFrame, df: pd.DataFrame,
    template_ppt_path: str = 'GT_TA.pptx',
    output_ppt_path: str = 'new_ppt.pptx'
):
    """Generate all charts and write them into a PowerPoint deck.

    Args:
        active_sites: Cleaned frame to visualise.
        df: Frame passed to the chart builder as the original data.
        template_ppt_path: Template presentation to start from.
        output_ppt_path: Destination of the generated presentation.
    """
    print("\n--- Starting chart generation ---\n")
    charts, insights = visualize_column_summary(active_sites, df)
    print("\n--- Finished chart generation ---\n")

    await build_presentation_with_charts(
        template_path=template_ppt_path,
        chart_figures_and_titles=charts,
        output_path=output_ppt_path,
        insight_points=insights,
    )

if __name__ == "__main__":
    # Detect whether an event loop is already running (as in a notebook kernel)
    # without relying on the deprecated implicit loop of asyncio.get_event_loop().
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is not None:
        # Keep a reference to the task: the event loop only holds weak references
        # to tasks, so an unreferenced one may be garbage-collected before it finishes.
        ppt_task = running_loop.create_task(main_execution(active_sites, df))
        print("PowerPoint integration task scheduled on existing event loop.")
    else:
        # No loop is running: run the pipeline to completion exactly once.
        # A RuntimeError raised by the pipeline itself now propagates instead of
        # triggering a second run.
        asyncio.run(main_execution(active_sites, df))