In [None]:
import pandas as pd 
from scipy import stats
import numpy as np
from io import StringIO
from matplotlib import pyplot as plt

In [None]:
# Name (or path) of the samtools executable; interpolated into the shell
# command that dumps the BAM records further down in this notebook.
SAMTOOLS_CMD = 'samtools'

In [None]:
# Load the tab-separated identifier table for the reporter reads.
# The first column is used as the index (it is joined on by index later).
reads = pd.read_csv(
    '../demux/rep1/identifiers-reporter.txt',
    sep='\t',
    index_col=0,
)
reads.head()

In [None]:
# Dump the BAM as SAM text, keep tab-separated fields 1 (read name) and 27,
# strip the 'pt:i:' tag prefix, and drop duplicate output lines.
# NOTE(review): `cut -f1,27` assumes the pt:i: tag is always the 27th field of
# every record; optional SAM tags have no guaranteed position — TODO confirm
# for this BAM, otherwise a different tag would be parsed as the length.
ret = !$SAMTOOLS_CMD view ../polya/rep1/reporter.bam |cut -f1,27| sed -e 's/pt:i://g' | sort | uniq
# Parse the captured output lines as a two-column TSV (no header in the data).
polya_len = pd.read_csv(StringIO('\n'.join(ret)), sep='\t', names=['read_id', 'polya_len'])
# Discard rows whose value is below 5 and index the remainder by read ID.
# NOTE(review): `uniq` only removes identical (read_id, value) lines; a read ID
# listed with two different values would stay duplicated in the index — verify.
polya_len = polya_len[polya_len['polya_len'] >= 5].set_index('read_id') # Minimum poly(A) length kept: 5
polya_len.head()

In [None]:
# Inner join on the index of both tables: only rows present in `reads`
# AND in the filtered `polya_len` table survive.
reads_wpa = reads.merge(polya_len, left_index=True, right_index=True)
reads_wpa.head()

In [None]:
# Non-null 'barcode_confidence' entries per (sample, rnaname) group,
# pivoted so that samples are rows and RNA names are columns.
(
    reads_wpa
    .groupby(['sample', 'rnaname'])['barcode_confidence']
    .count()
    .unstack()
)

In [None]:
def get_polya_dist(sample, rna):
    """Return the 'polya_len' values of the rows in the global ``reads_wpa``
    table whose 'sample' equals ``sample`` and whose 'rnaname' equals ``rna``.

    The result is a plain Python list (empty when nothing matches).
    """
    is_sample = reads_wpa['sample'] == sample
    is_rna = reads_wpa['rnaname'] == rna
    return reads_wpa.loc[is_rna & is_sample, 'polya_len'].tolist()

# Six distributions in total: for each reporter RNA, one list for the
# untreated sample (left half of a violin) and one for rg7834 (right half).
data1_left, data1_right = [get_polya_dist(cond, 'control') for cond in ('untreated', 'rg7834')]
data2_left, data2_right = [get_polya_dist(cond, 'A2') for cond in ('untreated', 'rg7834')]
data3_left, data3_right = [get_polya_dist(cond, 'A7') for cond in ('untreated', 'rg7834')]

# Collect them by side, in plotting order (control, A2, A7)
left_data = [data1_left, data2_left, data3_left]
right_data = [data1_right, data2_right, data3_right]

In [None]:
def split_violin_plot(ax, data_left, data_right, position, width=0.8, bw_method=0.25, proportional=True):
    """
    Draw one split violin on ``ax``, centred on x = ``position``.

    The left half shows the Gaussian KDE of ``data_left`` (gray) and the right
    half the KDE of ``data_right`` (red). Both densities are evaluated on a
    shared 200-point grid spanning the combined min..max of the two datasets,
    and a short white line marks the median of each half.

    Parameters
    ----------
    ax : object providing ``fill_betweenx`` and ``plot`` (a matplotlib Axes).
    data_left, data_right : sequences of numbers.
        ``scipy.stats.gaussian_kde`` raises ``ValueError`` for inputs it
        cannot fit (e.g. fewer than two values).
    position : float
        x coordinate of the violin's centre line.
    width : float
        Full width of a violin whose halves are both unscaled; the tallest
        density peak of the two halves reaches ``width / 2`` before scaling.
    bw_method : passed through to ``scipy.stats.gaussian_kde``.
    proportional : bool or number
        Falsy: both halves use the same scale (scale factor 1).
        Truthy: each half-width is multiplied by ``count / max(counts)`` so
        the half with fewer observations is drawn narrower, roughly in
        proportion to its sample count. If the value is a number (int or
        float, e.g. ``2`` or ``1.5``) both halves are additionally multiplied
        by it; ``True`` behaves as a factor of 1.

    Returns
    -------
    None
    """
    # Kernel density estimates for both halves
    kde_left = stats.gaussian_kde(data_left, bw_method=bw_method)
    kde_right = stats.gaussian_kde(data_right, bw_method=bw_method)

    # Shared evaluation grid covering the combined data range
    y_min = min(np.min(data_left), np.min(data_right))
    y_max = max(np.max(data_left), np.max(data_right))
    y_range = np.linspace(y_min, y_max, 200)

    density_left = kde_left(y_range)
    density_right = kde_right(y_range)

    # Per-half scaling. The count ratio is applied linearly to the half-width:
    # each KDE has (approximately) unit area over the grid, so scaling the
    # width linearly already scales the drawn area with the sample count.
    if proportional:
        count_left = len(data_left)
        count_right = len(data_right)
        max_count = max(count_left, count_right)
        scale_left = count_left / max_count
        scale_right = count_right / max_count
        # Accept any real number as an extra magnification factor, not only
        # ints (a float such as 1.5 used to be silently ignored). ``True`` is
        # an int and multiplies by 1, i.e. it leaves the scale unchanged.
        if isinstance(proportional, (int, float, np.integer, np.floating)):
            scale_left *= proportional
            scale_right *= proportional
    else:
        scale_left = 1
        scale_right = 1

    # Normalise with the common peak so the two halves stay comparable
    max_density = max(np.max(density_left), np.max(density_right))
    density_left = density_left / max_density * width / 2 * scale_left
    density_right = density_right / max_density * width / 2 * scale_right

    # Outer outline of each half, measured from the centre line
    x_left = position - density_left
    x_right = position + density_right

    # Left half (gray)
    ax.fill_betweenx(y_range, position, x_left, alpha=1, color='#7f7f7f', edgecolor='none', linewidth=0)

    # Right half (red)
    ax.fill_betweenx(y_range, position, x_right, alpha=1, color='#a81f24', edgecolor='none', linewidth=0)

    # Median markers: white ticks starting at the centre line; their length
    # depends on ``width`` only, not on the per-half scale.
    median_left = np.median(data_left)
    median_right = np.median(data_right)
    median_line_length = width / 8

    ax.plot([position - median_line_length, position], [median_left, median_left],
            color='white', linewidth=1.5)
    ax.plot([position, position + median_line_length], [median_right, median_right],
            color='white', linewidth=1.5)

In [None]:
# Figure 1: split violins whose half-widths reflect the relative read counts
fig, ax = plt.subplots(figsize=(4, 2.5))

# x positions of the three violins (Control, A2, A7)
positions = [1, 2, 3]

# One split violin per RNA: untreated on the left, rg7834 on the right
for i, pos in enumerate(positions):
    data_l, data_r = left_data[i], right_data[i]
    split_violin_plot(ax, data_l, data_r, pos, bw_method=0.3, proportional=2)

# Labels, ticks and grid
ax.set_ylabel('Tail length (nt)')
ax.set_xticks(positions)
ax.set_xticklabels(['Control', 'A2', 'A7'])
ax.grid(True, alpha=0.3)

# Optional centre line for each violin (disabled)
#for pos in positions:
#    ax.axvline(x=pos, color='#888888', alpha=1, linewidth=1)

# Data-driven y limits are computed but currently not applied; the fixed
# 0-300 range below is used instead.
all_data = np.concatenate(left_data + right_data)
y_min, y_max = all_data.min(), all_data.max()
y_padding = 0.1 * (y_max - y_min)
#ax.set_ylim(y_min - y_padding, y_max + y_padding)
ax.set_xlim(0.2, 3.5)
ax.set_yticks(np.arange(0, 301, 50))
ax.set_ylim(0, 300)

# Legend with one colour patch per treatment
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colour, alpha=1, label=label)
    for colour, label in (('#7f7f7f', 'Untreated'), ('#a81f24', 'RG7834'))
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

# Keep only the left spine (offset outward) and hide the x tick marks
ax.spines['left'].set_position(('outward', 5))
for side in ('top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
plt.setp(ax.xaxis.get_ticklines(), visible=False)

plt.tight_layout()
plt.savefig('rep1-violin-proportional.pdf')

In [None]:
# Figure 2: split violins with both halves drawn on the same scale
fig, ax = plt.subplots(figsize=(4, 2.5))

# x positions of the three violins (Control, A2, A7)
positions = [1, 2, 3]

# One split violin per RNA: untreated on the left, rg7834 on the right
for i, pos in enumerate(positions):
    data_l, data_r = left_data[i], right_data[i]
    split_violin_plot(ax, data_l, data_r, pos, width=1.2, bw_method=0.3, proportional=False)

# Labels, ticks and grid
ax.set_ylabel('Tail length (nt)')
ax.set_xticks(positions)
ax.set_xticklabels(['Control', 'A2', 'A7'])
ax.grid(True, alpha=0.3)

# Optional centre line for each violin (disabled)
#for pos in positions:
#    ax.axvline(x=pos, color='black', alpha=0.3, linewidth=1)

# Data-driven y limits are computed but currently not applied; the fixed
# 0-300 range below is used instead.
all_data = np.concatenate(left_data + right_data)
y_min, y_max = all_data.min(), all_data.max()
y_padding = 0.1 * (y_max - y_min)
#ax.set_ylim(y_min - y_padding, y_max + y_padding)
ax.set_xlim(0.5, 3.5)
ax.set_yticks(np.arange(0, 301, 50))
ax.set_ylim(0, 300)

# Legend with one colour patch per treatment
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colour, alpha=1, label=label)
    for colour, label in (('#7f7f7f', 'Untreated'), ('#a81f24', 'RG7834'))
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

# Keep only the left spine (offset outward) and hide the x tick marks
ax.spines['left'].set_position(('outward', 5))
for side in ('top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
plt.setp(ax.xaxis.get_ticklines(), visible=False)

plt.tight_layout()
plt.savefig('rep1-violin-uniform.pdf')