In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the tab-separated input table
file_path = "021.filtered_no_AFDB_sqanti3_classifcation_90928_renameSymbo_riboORF.tsv"
df = pd.read_csv(file_path, sep='\t')

# Boolean row masks keyed on the isoform_id prefix.
# na=False makes rows with a missing isoform_id match neither class.
isoform_ids = df['isoform_id']
enst_mask = isoform_ids.str.startswith('ENST', na=False)
talont_mask = isoform_ids.str.startswith('TALONT', na=False)

# Count rows with / without a non-zero mean_ORF_count
def get_counts(mask, data=None):
    """Count selected rows with and without a non-zero ``mean_ORF_count``.

    Parameters
    ----------
    mask
        Boolean selector applied to the table (``table[mask]``).
    data
        Table to count in. Defaults to the notebook-level ``df`` so that
        existing ``get_counts(mask)`` calls keep working.

    Returns
    -------
    dict
        ``{'total', 'has_orf', 'no_orf'}`` as plain Python ints. A row counts
        as ``has_orf`` when ``mean_ORF_count`` is neither missing nor 0;
        every other selected row counts as ``no_orf``.
    """
    table = df if data is None else data
    subset = table[mask]
    orf_count = subset['mean_ORF_count']
    has_orf = orf_count.notna() & (orf_count != 0)

    total = len(subset)
    # Vectorised count; "missing or 0" is exactly the complement of has_orf,
    # so it does not need a second mask.
    n_has_orf = int(has_orf.sum())
    return {
        'total': total,
        'has_orf': n_has_orf,
        'no_orf': total - n_has_orf
    }

def _as_percentages(counts):
    """Express every entry of a counts dict as a percentage of its 'total'.

    The 'total' key itself is dropped from the result. When 'total' is 0
    (no rows in the class) every percentage is reported as 0.0 instead of
    raising ZeroDivisionError.
    """
    total = counts['total']
    return {
        key: (value / total * 100 if total else 0.0)
        for key, value in counts.items()
        if key != 'total'
    }


# Row counts per transcript class
enst_counts = get_counts(enst_mask)
talont_counts = get_counts(talont_mask)

# Share of each class with / without ORF, in percent
enst_props = _as_percentages(enst_counts)
talont_props = _as_percentages(talont_counts)

# Inputs for the stacked bar chart: one bar per transcript class
labels = ['ENST', 'TALONT']
has_orf_props = [props['has_orf'] for props in (enst_props, talont_props)]
no_orf_props = [props['no_orf'] for props in (enst_props, talont_props)]

fig, ax = plt.subplots(figsize=(10, 6))

# "No ORF" share is stacked on top of the "Has ORF" share
bars1 = ax.bar(labels, has_orf_props, label='Has ORF', color='#66b3ff')
bars2 = ax.bar(labels, no_orf_props, bottom=has_orf_props, label='No ORF', color='#ff9999')

# Title, axis labels and legend
ax.set_title('Distribution of ORF Presence by Transcript Type', fontsize=14)
ax.set_xlabel('Transcript Type', fontsize=12)
ax.set_ylabel('Percentage (%)', fontsize=12)
ax.legend()

# Write "count (percent)" in the vertical middle of every bar segment
label_style = dict(ha='center', va='center', color='white', fontweight='bold')
per_class = zip((enst_counts, talont_counts), has_orf_props, no_orf_props)
for x_pos, (class_counts, with_pct, without_pct) in enumerate(per_class):
    ax.text(x_pos, with_pct / 2,
            f'{class_counts["has_orf"]}\n({with_pct:.1f}%)',
            **label_style)
    ax.text(x_pos, with_pct + without_pct / 2,
            f'{class_counts["no_orf"]}\n({without_pct:.1f}%)',
            **label_style)

# Print the detailed statistics for each transcript class
for heading, class_counts, class_props in (
    ("\nENST转录本统计:", enst_counts, enst_props),
    ("\nTALONT转录本统计:", talont_counts, talont_props),
):
    print(heading)
    print(f"总数: {class_counts['total']}")
    print(f"有ORF: {class_counts['has_orf']} ({class_props['has_orf']:.2f}%)")
    print(f"无ORF: {class_counts['no_orf']} ({class_props['no_orf']:.2f}%)")

# Save to disk and release the figure (nothing is shown inline)
fig.tight_layout()
fig.savefig('orf_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)


ENST转录本统计:
总数: 24177
有ORF: 14496 (59.96%)
无ORF: 9681 (40.04%)

TALONT转录本统计:
总数: 66752
有ORF: 33271 (49.84%)
无ORF: 33481 (50.16%)


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the tab-separated input table
file_path = "021.filtered_no_AFDB_sqanti3_classifcation_90928_renameSymbo_riboORF.tsv"
df = pd.read_csv(file_path, sep='\t')

# Boolean row masks keyed on the isoform_id prefix.
# na=False makes rows with a missing isoform_id match neither class.
isoform_ids = df['isoform_id']
enst_mask = isoform_ids.str.startswith('ENST', na=False)
talont_mask = isoform_ids.str.startswith('TALONT', na=False)

# Per-class counting helpers
def get_counts(mask, data=None):
    """Count selected rows with and without a non-zero ``mean_ORF_count``.

    Parameters
    ----------
    mask
        Boolean selector applied to the table (``table[mask]``).
    data
        Table to count in. Defaults to the notebook-level ``df`` so that
        existing ``get_counts(mask)`` calls keep working.

    Returns
    -------
    dict
        ``{'total', 'has_orf', 'no_orf'}`` as plain Python ints. A row counts
        as ``has_orf`` when ``mean_ORF_count`` is neither missing nor 0;
        every other selected row counts as ``no_orf``.
    """
    table = df if data is None else data
    subset = table[mask]
    orf_count = subset['mean_ORF_count']
    has_orf = orf_count.notna() & (orf_count != 0)

    total = len(subset)
    # Vectorised count; "missing or 0" is exactly the complement of has_orf,
    # so it does not need a second mask.
    n_has_orf = int(has_orf.sum())
    return {
        'total': total,
        'has_orf': n_has_orf,
        'no_orf': total - n_has_orf
    }

def get_detect_orf_ribo(mask, data=None):
    """Count selected rows by their ``detect_ORF_ribo`` flag.

    Parameters
    ----------
    mask
        Boolean selector applied to the table (``table[mask]``).
    data
        Table to count in. Defaults to the notebook-level ``df`` so that
        existing ``get_detect_orf_ribo(mask)`` calls keep working.

    Returns
    -------
    dict
        ``{'total', 'true', 'false'}`` as plain Python ints. ``true`` is the
        sum of the ``detect_ORF_ribo`` column over the selected rows and
        ``false`` is every remaining selected row.
    """
    table = df if data is None else data
    subset = table[mask]
    total = len(subset)
    # int() keeps the result a plain integer whatever scalar type .sum()
    # returns for the column (NumPy integer, or float for a 0/1 float column),
    # matching the ints returned by get_counts.
    true_count = int(subset['detect_ORF_ribo'].sum())
    return {
        'total': total,
        'true': true_count,
        'false': total - true_count
    }

def _as_percentages(counts):
    """Express every entry of a counts dict as a percentage of its 'total'.

    The 'total' key itself is dropped from the result. When 'total' is 0
    (no rows in the class) every percentage is reported as 0.0 instead of
    failing on a division by zero.
    """
    total = counts['total']
    return {
        key: (value / total * 100 if total else 0.0)
        for key, value in counts.items()
        if key != 'total'
    }


# mean_ORF_count statistics per transcript class
enst_counts = get_counts(enst_mask)
talont_counts = get_counts(talont_mask)

# detect_ORF_ribo statistics per transcript class
enst_detect_counts = get_detect_orf_ribo(enst_mask)
talont_detect_counts = get_detect_orf_ribo(talont_mask)

# Shares in percent: keys 'has_orf' / 'no_orf' and 'true' / 'false'
enst_props = _as_percentages(enst_counts)
talont_props = _as_percentages(talont_counts)
enst_detect_props = _as_percentages(enst_detect_counts)
talont_detect_props = _as_percentages(talont_detect_counts)

# Transcript classes on the x-axis; every list below follows this order
labels = ['ENST', 'TALONT']


def _save_stacked_percentage_bars(labels, segments, title, out_path):
    """Draw a stacked percentage bar chart and save it as a PNG.

    Parameters
    ----------
    labels
        Bar names, one per x position.
    segments
        Sequence of ``(legend_label, color, counts, percentages)`` tuples,
        listed bottom to top. ``counts`` and ``percentages`` each hold one
        value per entry of ``labels``.
    title
        Figure title.
    out_path
        File the figure is written to (300 dpi, tight bounding box).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    bottoms = [0.0] * len(labels)
    for legend_label, color, counts, percentages in segments:
        ax.bar(labels, percentages, bottom=bottoms, label=legend_label, color=color)
        for x_pos, (count, pct, base) in enumerate(zip(counts, percentages, bottoms)):
            if pct <= 0:
                # A zero-height segment has no area to label; writing the text
                # anyway would overlap the neighbouring segment's label.
                continue
            # "count (percent)" in the vertical middle of the segment
            ax.text(x_pos, base + pct / 2,
                    f'{count}\n({pct:.1f}%)',
                    ha='center', va='center', color='white', fontweight='bold')
        # The next segment starts where this one ends
        bottoms = [base + pct for base, pct in zip(bottoms, percentages)]

    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Transcript Type', fontsize=12)
    ax.set_ylabel('Percentage (%)', fontsize=12)
    ax.legend()

    fig.tight_layout()
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    # Close this specific figure so repeated runs do not accumulate open figures
    plt.close(fig)


# Figure 1: mean_ORF_count
has_orf_props = [enst_props['has_orf'], talont_props['has_orf']]
no_orf_props = [enst_props['no_orf'], talont_props['no_orf']]

_save_stacked_percentage_bars(
    labels,
    [
        ('Has ORF', '#66b3ff',
         [enst_counts['has_orf'], talont_counts['has_orf']], has_orf_props),
        ('No ORF', '#ff9999',
         [enst_counts['no_orf'], talont_counts['no_orf']], no_orf_props),
    ],
    title='Distribution of ORF Presence by Transcript Type',
    out_path='orf_distribution.png',
)

# Figure 2: detect_ORF_ribo
true_props = [enst_detect_props['true'], talont_detect_props['true']]
false_props = [enst_detect_props['false'], talont_detect_props['false']]

_save_stacked_percentage_bars(
    labels,
    [
        ('True', '#66b3ff',
         [enst_detect_counts['true'], talont_detect_counts['true']], true_props),
        ('False', '#ff9999',
         [enst_detect_counts['false'], talont_detect_counts['false']], false_props),
    ],
    title='Distribution of detect_ORF_ribo by Transcript Type',
    out_path='detect_orf_ribo_distribution.png',
)

# Print the detailed statistics: ORF presence first, then detect_ORF_ribo
for heading, class_counts, class_props in (
    ("\nENST转录本统计:", enst_counts, enst_props),
    ("\nTALONT转录本统计:", talont_counts, talont_props),
):
    print(heading)
    print(f"总数: {class_counts['total']}")
    print(f"有ORF: {class_counts['has_orf']} ({class_props['has_orf']:.2f}%)")
    print(f"无ORF: {class_counts['no_orf']} ({class_props['no_orf']:.2f}%)")

for heading, class_counts, class_props in (
    ("\nENST detect_ORF_ribo统计:", enst_detect_counts, enst_detect_props),
    ("\nTALONT detect_ORF_ribo统计:", talont_detect_counts, talont_detect_props),
):
    print(heading)
    print(f"总数: {class_counts['total']}")
    print(f"True: {class_counts['true']} ({class_props['true']:.2f}%)")
    print(f"False: {class_counts['false']} ({class_props['false']:.2f}%)")



ENST转录本统计:
总数: 24177
有ORF: 14496 (59.96%)
无ORF: 9681 (40.04%)

TALONT转录本统计:
总数: 66752
有ORF: 33271 (49.84%)
无ORF: 33481 (50.16%)

ENST detect_ORF_ribo统计:
总数: 24177
True: 20025 (82.83%)
False: 4152 (17.17%)

TALONT detect_ORF_ribo统计:
总数: 66752
True: 59024 (88.42%)
False: 7728 (11.58%)


## Correlation of ORF mean count and tumor/TNBC expression

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the tab-separated node table.
# NOTE: this rebinds the notebook-level `df`; helpers that read the global `df`
# (e.g. get_counts) will operate on this table if called after this cell.
df = pd.read_csv('014.3DisoGalaxy_nodes_uniprotKB_pfam_proteinVec_gene_symbol_with_rep_ConsisIndex_diffExp_ORFcount_fullINFO.tsv', sep='\t')

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and deprecated the bare 'seaborn' name (later removed). Use the new name
# when this Matplotlib provides it and fall back to the old one otherwise,
# so the same look is kept without a deprecation warning or an error.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# Panels 1-3: mean_ORF_count against the expression column of one condition
# each, annotated with the correlation coefficient from Series.corr
single_panels = (
    (axes[0, 0], 'Normal_mean_log2tmm', 'Normal'),
    (axes[0, 1], 'Tumor_mean_log2tmm', 'Tumor'),
    (axes[1, 0], 'TNBC_mean_log2tmm', 'TNBC'),
)
for panel_ax, expr_col, condition in single_panels:
    sns.scatterplot(data=df, x='mean_ORF_count', y=expr_col, ax=panel_ax)
    panel_ax.set_title(f'Correlation: ORF Count vs {condition} Expression')
    panel_ax.set_xlabel('Mean ORF Count')
    panel_ax.set_ylabel(f'{condition} Mean log2TMM')
    correlation = df['mean_ORF_count'].corr(df[expr_col])
    # Axes-fraction coordinates: the label sits near the top-left corner
    panel_ax.text(0.05, 0.95, f'r = {correlation:.3f}',
                  transform=panel_ax.transAxes)

# Panel 4: all three conditions overlaid on the same axes
combined_ax = axes[1, 1]
for _, expr_col, condition in single_panels:
    sns.scatterplot(data=df, x='mean_ORF_count', y=expr_col,
                    label=condition, ax=combined_ax)
combined_ax.set_title('Combined Correlation Plot')
combined_ax.set_xlabel('Mean ORF Count')
combined_ax.set_ylabel('Mean log2TMM')
combined_ax.legend()

# Adjust the spacing between subplots
fig.tight_layout()

# Save to disk and release the figure (nothing is shown inline)
fig.savefig('correlation_analysis.png', dpi=300, bbox_inches='tight')
plt.close(fig)

  plt.style.use('seaborn')
