# ヒトTTN遺伝子のイントロンの5'、3'側末端の塩基の保存性の評価

**目的**
イントロン同士の配列を比較して、相同性を確認する<br>
相同性とスプライシングパターンの関連がないかを検証

In [1]:
# オリジナルモジュールのインポート
from lib.gbkparse import Seq_count
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import subprocess
import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## 準備

In [2]:
# Location of the GenBank record for human TTN, relative to this notebook.
ttn_gbk_path = '../data/gbk/human_ttn.gb'

# Create the Seq_count object first, then have it read the GenBank file;
# every later cell queries exon/intron information through `gbk`.
gbk = Seq_count()
gbk.read_gbk(ttn_gbk_path)

デフォルト値として、最もエクソンの多いNM_001267550.2を設定


In [3]:
# Read the genomic span of the record from the GenBank header.
# Only the first 10 lines are searched for a whitespace-separated token of the
# form `complement(<start>..<end>)`; if several tokens match, the last one wins.
with open('../data/gbk/human_ttn.gb') as f:
    lines = f.readlines()

# Reset before searching so that a value left over from an earlier run of this
# cell can never be picked up silently when the header does not match.
start = end = None
for header_line in lines[:10]:
    for token in header_line.split():
        if "complement" in token:
            coords = token.replace("complement(", "").replace(")", "")
            start, end = (int(value) for value in coords.split(".."))

if start is None:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "no complement(start..end) range found in the first 10 lines of "
        "../data/gbk/human_ttn.gb"
    )
print(start, end)

178525989 178807423


## UCSCのTable BrowserからダウンロードしたphastCons100wayのデータの読み込み

In [12]:
# Load the conservation table, which is stored as three files, and stack them.
# Each file is read the same way: tab-separated, no header row, the first
# 9 lines skipped, and at most 100000 data rows taken.
ttn1, ttn2, ttn3 = (
    pd.read_csv(path, sep='\t', skiprows=9, nrows=100000, header=None)
    for path in (
        '../data/ucsc/ucsc_ttn1.txt',
        '../data/ucsc/ucsc_ttn2.txt',
        '../data/ucsc/ucsc_ttn3.txt',
    )
)
ttn = pd.concat([ttn1, ttn2, ttn3])

# Keep only the rows whose position (column 0) lies inside the TTN span parsed
# from the GenBank header. Copy explicitly so that renaming the columns acts
# on an independent frame rather than on a selection of `ttn`.
ucsc_ttn = ttn[(ttn[0] >= start) & (ttn[0] <= end)].copy()
ucsc_ttn.columns = ['pos', 'cons']

# Some positions are missing from the download. Left-join the scores onto a
# gap-free position grid so every base has a row (NaN where no score exists).
# The grid is built from the parsed `start`/`end` instead of repeating the
# coordinates as literals.
# NOTE(review): np.arange excludes `end`, so the grid stops at end - 1 while
# the filter above is inclusive. The strand-reversal step later in the notebook
# assumes the highest position is end - 1, so the bound is left unchanged here;
# confirm whether the last base of the span should be included.
tmp = pd.DataFrame({"pos": np.arange(start, end)})
df = pd.merge(tmp, ucsc_ttn, on='pos', how='left')

## 各イントロンとエクソンの塩基ごとの保存性の平均値を求める

In [5]:
def _mean_conservation(frame, regions):
    """Return {1-based region number: mean of the first column over the region}.

    Each region is sliced positionally as ``frame.iloc[r[0]:r[1], 0]`` and the
    mean is rounded to two decimals.
    """
    return {
        number: round(frame.iloc[region[0]:region[1], 0].mean(), 2)
        for number, region in enumerate(regions, start=1)
    }


# The TTN mRNA is on the reverse strand, so put the rows in descending genomic
# order and renumber positions as offsets from the highest coordinate
# (highest position -> 0). Using the column maximum avoids repeating the
# coordinate as a literal.
df = df.sort_values('pos', ascending=False)
df['pos'] = df['pos'].max() - df['pos']
df = df.set_index('pos', drop=True)

# Mean conservation per exon and per intron, keyed by 1-based number.
# The region lists are fetched once each instead of on every loop iteration.
# NOTE(review): presumably exon_list()/intron_list() yield (start, end) row
# offsets counted from the transcript-strand start - confirm in lib.gbkparse.
exon_cons_ave = _mean_conservation(df, gbk.exon_list())
intron_cons_ave = _mean_conservation(df, gbk.intron_list())

In [14]:
# Plot the per-exon and per-intron mean conservation as two stacked bar charts.
# Each entry pairs a subplot title with the {number: mean} dict it displays.
panels = (
    ("各エクソンの塩基の保存性の平均", exon_cons_ave),
    ("各イントロンの塩基の保存性の平均", intron_cons_ave),
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=tuple(title for title, _ in panels),
)
for row_no, (_, averages) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(x=list(averages.keys()), y=list(averages.values())),
        row=row_no,
        col=1,
    )
    # Pin the y axis to [0, 1] so both panels share the same scale.
    fig.update_yaxes(range=[0, 1], row=row_no, col=1)

fig.update_layout(height=500, width=1200, title_text="塩基の保存性の平均値")
fig.show()

## 各イントロン末端からnum塩基の各塩基の保存性を可視化

In [7]:
# Number of bases to inspect at each end of every intron.
num = 80

# Fetch the intron list once; both windows below use the same regions.
introns = gbk.intron_list()

# Per-base conservation of the first `num` bases at the 5' end of each intron.
# Rows are 1-based intron numbers; columns are offsets 1..num from the 5' end.
# NOTE(review): the window is always `num` rows long, so for an intron shorter
# than `num` it extends past the intron boundary - confirm this is intended.
left_intron_cons = {
    number: list(df.iloc[region[0]:region[0] + num, 0])
    for number, region in enumerate(introns, start=1)
}
left_intron_cons_ave_df = pd.DataFrame.from_dict(left_intron_cons).T
left_intron_cons_ave_df.columns = list(range(1, num + 1))
left_intron_cons_ave_df = left_intron_cons_ave_df.round(2)

# Per-base conservation of the last `num` bases at the 3' end of each intron.
# Columns are offsets -num..-1, where -1 is the last row of the window.
right_intron_cons = {
    number: list(df.iloc[region[1] - num:region[1], 0])
    for number, region in enumerate(introns, start=1)
}
right_intron_cons_ave_df = pd.DataFrame.from_dict(right_intron_cons).T
right_intron_cons_ave_df.columns = list(range(-num, 0))
# Store the rounded table (previously it was computed but discarded), so the
# 3' data is rounded the same way as the 5' data.
right_intron_cons_ave_df = right_intron_cons_ave_df.round(2)

# Last expression of the cell: show the 3' table as the cell output.
right_intron_cons_ave_df

Unnamed: 0,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,...,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1
1,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,0.05,0.99,1.00,0.01,1.00,1.00,1.00,1.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.01,0.00,...,0.00,0.00,0.00,0.02,0.01,0.00,0.99,1.00,1.00,1.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.01,0.20,0.19,0.69,0.80,0.81,1.00,1.00,1.00,1.00
4,0.40,0.02,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.92,0.91,0.06,0.05,0.00,0.00,0.15,1.00,1.00,1.00
5,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.05,0.02,0.03,0.99,1.00,0.04,0.98,1.00,1.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,0.00,0.00,0.00,0.00,0.10,0.19,0.2,0.19,0.11,0.10,...,0.20,0.11,0.12,0.12,0.11,0.12,0.10,1.00,1.00,1.00
359,0.00,0.24,0.20,0.12,0.09,0.01,0.0,0.00,0.00,0.00,...,1.00,0.98,0.98,0.98,0.98,1.00,1.00,0.99,0.99,0.92
360,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.11,0.08,...,0.01,0.01,0.00,0.91,0.96,0.89,0.03,0.96,1.00,1.00
361,0.00,0.00,0.45,0.36,0.09,0.04,0.0,0.00,0.00,0.04,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00


In [15]:
# Draw the 5' and 3' intron-end conservation tables side by side as heatmaps
# (x: base offset, y: intron number, colour: conservation score).
fig = make_subplots(rows=1, cols=2, subplot_titles=(f"各イントロンの5'側の{num}塩基の保存性", f"各イントロンの3'側の{num}塩基の保存性"))

# Both panels use the same labelled hover text; the right panel previously
# showed a generic X/Y/Z placeholder.
left_hovertxt = 'Intron: %{y}<br>Base: %{x}<br>Conservation: %{z}<extra></extra>'
right_hovertxt = left_hovertxt

fig.add_trace(
    go.Heatmap(
        x=left_intron_cons_ave_df.columns,
        y=left_intron_cons_ave_df.index,
        z=left_intron_cons_ave_df.values,
        colorscale='Viridis',
        hovertemplate=left_hovertxt,
    ),
    row=1,
    col=1,
)
fig.add_trace(
    go.Heatmap(
        x=right_intron_cons_ave_df.columns,
        y=right_intron_cons_ave_df.index,
        z=right_intron_cons_ave_df.values,
        colorscale='Viridis',
        hovertemplate=right_hovertxt,
    ),
    row=1,
    col=2,
)

# Reverse the y axes so intron 1 is at the top of each panel.
fig.update_yaxes(autorange="reversed")
# Descriptive title in place of the leftover "example of subplots" placeholder.
fig.update_layout(height=750, width=1200, title_text="各イントロン末端の塩基の保存性")
fig.show()