### pLI and LOEUF Category Distribution by Variant Type

This analysis visualizes the distribution of **pLI** (probability of loss-of-function intolerance) and **LOEUF** (loss-of-function observed/expected upper bound fraction) categories across different **variant groups**.  

**Key steps in this cell:**
1. **Data Loading & Cleaning**
   - Reads `pli_AD_genes.csv` (update path if necessary).
   - Replaces `"SNV"` labels with `"Nonsense"` for consistency.
   - Removes rows where `pli_cat` or `loefu_cat` contain `"NA"` values.

2. **Group and Color Mapping**
   - Defines six variant groups (`minus1`, `plus1`, `Nonsense`, and their respective `_Control` groups).
   - Assigns consistent colors and display labels for plotting.

3. **Percentage Distribution Calculation**
   - Computes within-group percentage distributions of `pLI` and `LOEUF` categories (`Low`, `Medium`, `High`).

4. **Visualization**
   - Generates two grouped bar charts, arranged one above the other:
     - **Top:** pLI category distribution
     - **Bottom:** LOEUF category distribution
   - Within each pLI/LOEUF category, one bar is drawn per variant group; bars above 2% are labeled with their rounded percentage.
   - A single shared legend is positioned on the **lower-left** of the figure.

5. **Output**
   - Saves the figure as `pLI_LOEUF_category_bar_chart.png` at 300 dpi on a white background, with enlarged fonts and adjusted spacing for readability.

This visualization allows quick comparison of **loss-of-function constraint patterns** (via pLI and LOEUF) between experimental and control variant groups.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the data
df = pd.read_csv('pli_AD_genes.csv')   #Adjust path to new data

# Rename SNV to Nonsense. This is a literal substring replacement, so a value
# such as 'SNV_Control' would become 'Nonsense_Control' as well.
df['category'] = df['category'].str.replace('SNV', 'Nonsense', regex=False)

# Remove rows with a missing constraint category.
# pd.read_csv turns the text "NA" into NaN by default, and NaN != 'NA' is
# True, so comparing against the string alone would keep those rows. Drop the
# real NaN values first, then any literal 'NA' strings that survived loading
# (e.g. when the file is read with keep_default_na=False).
df_clean = df.dropna(subset=['pli_cat', 'loefu_cat'])
df_clean = df_clean[(df_clean['pli_cat'] != 'NA') & (df_clean['loefu_cat'] != 'NA')].copy()

# Variant order and colors.
# The order fixes both the left-to-right bar position inside each cluster and
# the legend order. Each variant group uses a saturated color and its control
# uses a lighter tint of the same hue.
category_order = ['minus1', 'minus1_Control', 'plus1', 'plus1_Control', 'Nonsense', 'Nonsense_Control']
category_colors = ['#4472C4', '#B4C7E7', '#C65911', '#F4B183', '#70AD47', '#C5E0B4']
color_mapping = dict(zip(category_order, category_colors))

# Legend display labels (Title Case & cleaner symbols), keyed by the raw
# group names used in `category_order`.
display_labels = {
    'minus1': 'Minus 1',
    'minus1_Control': 'Minus 1 Control',
    'plus1': 'Plus 1',
    'plus1_Control': 'Plus 1 Control',
    'Nonsense': 'Nonsense',
    'Nonsense_Control': 'Nonsense Control',
}

# --- Compute percentage distributions ---
# One record per variant group found in the cleaned table; each record holds
# the within-group percentage of rows in the Low / Medium / High category.
constraint_levels = ('Low', 'Medium', 'High')
groups_in_data = set(df_clean['category'].values)

pli_distributions = []
loefu_distributions = []

for category in category_order:
    # Groups that do not occur in the data get no record at all.
    if category not in groups_in_data:
        continue

    group_rows = df_clean.loc[df_clean['category'] == category]

    # normalize=True returns fractions of the group; scale to percent.
    pli_share = group_rows['pli_cat'].value_counts(normalize=True) * 100
    loefu_share = group_rows['loefu_cat'].value_counts(normalize=True) * 100

    pli_row = {'Category': category}
    loefu_row = {'Category': category}
    for level in constraint_levels:
        # A level with no rows in this group is reported as 0.
        pli_row[level] = pli_share.get(level, 0)
        loefu_row[level] = loefu_share.get(level, 0)

    pli_distributions.append(pli_row)
    loefu_distributions.append(loefu_row)

pli_df = pd.DataFrame(pli_distributions).set_index('Category')
loefu_df = pd.DataFrame(loefu_distributions).set_index('Category')

# ----- GLOBAL FONT UPGRADE -----
# rcParams are consulted when artists are created, so they are set before the
# figure and axes exist; updating them afterwards would not touch the title
# and axis-label objects the axes have already built.
plt.rcParams.update({
    "font.size": 14,         # base font size
    "axes.titlesize": 20,    # subplot titles (pLI, LOEUF)
    "axes.labelsize": 18,    # axis labels
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "legend.fontsize": 14,
})

# --- Figure / Axes (larger) ---
# Two panels, one above the other: ax1 (top) and ax2 (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(13.5, 10.5))


# Tick labels for the three constraint categories of each panel
x_labels = ['Low\n(pLI ≤ 0.35)', 'Medium\n(0.35 < pLI < 0.66)', 'High\n(pLI ≥ 0.66)']
x_labels_loefu = ['Low\n(LOEUF ≤ 0.2)', 'Medium\n(0.2 < LOEUF ≤ 0.6)', 'High\n(LOEUF > 0.6)']

# Bar geometry shared by both panels
x = np.arange(len(x_labels))
width = 0.14  # slightly wider bars to "enlarge" the bar graphs


def _draw_category_bars(ax, dist_df, tick_labels, axis_label, title):
    """Draw one grouped bar chart of category percentages on *ax*.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    dist_df : DataFrame indexed by variant group with 'Low', 'Medium' and
        'High' percentage columns.
    tick_labels : the three x tick labels, in Low / Medium / High order.
    axis_label : x-axis label text.
    title : panel title text.

    Reads the module-level `category_order`, `display_labels`,
    `color_mapping`, `x` and `width`. Groups missing from `dist_df` are
    skipped but keep their slot, so a given group sits at the same offset in
    every panel.
    """
    n_groups = len(category_order)
    for i, category in enumerate(category_order):
        if category not in dist_df.index:
            continue

        vals = [dist_df.loc[category, level] for level in ('Low', 'Medium', 'High')]
        # Center the cluster of bars on each tick position.
        offset = (i - (n_groups - 1) / 2) * width
        bars = ax.bar(x + offset, vals, width,
                      label=display_labels[category],
                      color=color_mapping[category],
                      alpha=0.9, edgecolor='black', linewidth=0.6)

        # Value labels; bars of 2% or less are left unlabeled.
        for b, v in zip(bars, vals):
            if v > 2:
                ax.annotate(f'{v:.0f}%', (b.get_x() + b.get_width()/2, b.get_height()),
                            xytext=(0, 4), textcoords="offset points",
                            ha='center', va='bottom', fontsize=11, fontweight='bold')

    ax.set_xlabel(axis_label, fontweight='bold', fontsize=16)
    ax.set_ylabel('Percentage (%)', fontweight='bold', fontsize=16)
    ax.set_title(title, fontweight='bold', fontsize=18)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, fontsize=13)
    ax.tick_params(axis='y', labelsize=13)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, 100)


# pLI bars
_draw_category_bars(ax1, pli_df, x_labels, 'pLI Category', 'pLI Category Distribution')

# LOEUF bars
_draw_category_bars(ax2, loefu_df, x_labels_loefu, 'LOEUF Category', 'LOEUF Category Distribution')

# ----- ONE LEGEND ON LEFT -----
# Handles and labels are taken from the top panel only and reused for the
# whole figure.
handles, labels = ax1.get_legend_handles_labels()

# Title-case every word. str.capitalize() would lowercase everything after
# the first character ("Minus 1 Control" -> "Minus 1 control").
labels = [lbl.title() for lbl in labels]

legend = fig.legend(
    handles, labels,
    title="Variant Type",
    loc='lower left',
    bbox_to_anchor=(-0.12, 0.20),   # Adjust positioning further if needed
    borderaxespad=0.0,
    frameon=False,
)

# Re-apply the title so it can carry its own size and weight.
legend.set_title("Variant Type", prop={'size': 18, 'weight': 'bold'})


# Layout adjustments to keep spacing clean
plt.tight_layout(rect=[0.18, 0.04, 1, 1])
plt.subplots_adjust(left=0.22)


# bbox_inches='tight' crops the saved image to the drawn artists.
plt.savefig('pLI_LOEUF_category_bar_chart.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()