# TTN遺伝子のイントロン同士の配列相同性をBLASTで評価する

**目的**
イントロン同士の配列を比較して、相同性を確認する<br>
相同性とスプライシングパターンの間に関連があるかを検証する

In [15]:
# オリジナルモジュールのインポート
from lib.gbkparse import Seq_count
import matplotlib.pyplot as plt
import pandas as pd

from Bio import pairwise2
from Bio.pairwise2 import format_alignment

import subprocess
import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## 準備

In [16]:
# Reset the BLAST working directory: remove any previous ../data/blast tree,
# then recreate its db/, fasta/ and result/ subdirectories.
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# reset stops the run here instead of being silently ignored. On success it
# returns 0, exactly like subprocess.call did.
cmd1 = "rm -rf ../data/blast"
cmd2 = "mkdir -p ../data/blast/db/ ../data/blast/fasta/ ../data/blast/result/"
subprocess.check_call(cmd1.split())
subprocess.check_call(cmd2.split())

0

In [17]:
# Load the human TTN GenBank file through the project's Seq_count class.
gbk_path = '../data/gbk/human_ttn.gb'
human = Seq_count()        # instantiate the class
human.read_gbk(gbk_path)   # read the gbk file

# Full-length gene sequence
seq = human.gDNA_seq()

デフォルト値として、最もエクソンの多いNM_001267550.2を設定


## BLASTの設定

In [18]:
# Save every intron sequence as FASTA in a single pass:
#   * ttn_intron.fasta     - all introns in one multi-record file
#   * ttn_intronNNN.fasta  - one single-record file per intron
# Records are named intron001, intron002, ... (1-based, zero-padded to 3 digits).
fasta_dir = "../data/blast/fasta"
with open(f"{fasta_dir}/ttn_intron.fasta", "w") as combined:
    for number, span in enumerate(human.intron_list(), start=1):
        name = f"intron{number:03d}"
        # span[0]:span[1] is used as a slice into the full gene sequence.
        record = f">{name}\n" + "".join(seq[span[0]:span[1]]) + "\n"
        combined.write(record)
        # The per-intron file gets the same newline-terminated record, so both
        # outputs are formatted identically.
        with open(f"{fasta_dir}/ttn_{name}.fasta", "w") as single:
            single.write(record)

In [19]:
# Create the BLAST database (nucleotide) from the combined intron FASTA.
# check=True raises CalledProcessError when makeblastdb exits with a non-zero
# status, so a failed build is reported here rather than being ignored.
cmd = "makeblastdb -in ../data/blast/fasta/ttn_intron.fasta -dbtype nucl -parse_seqids -out ../data/blast/db/ttn_intron"
subprocess.run(cmd.split(), check=True)



Building a new DB, current time: 11/14/2023 09:14:35
New DB name:   /home/du/GitHub/bio_chem/ttn_splicing/data/blast/db/ttn_intron
New DB title:  ../data/blast/fasta/ttn_intron.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 362 sequences in 0.13954 seconds.




CompletedProcess(args=['makeblastdb', '-in', '../data/blast/fasta/ttn_intron.fasta', '-dbtype', 'nucl', '-parse_seqids', '-out', '../data/blast/db/ttn_intron'], returncode=0)

## BLASTの実行

In [20]:
# Run blastn for every intron against the intron database.
# One result file is written per query: ../data/blast/result/ttn_intronNNN.txt
fmt = 6  # value passed to -outfmt; loop-invariant, so set once
for i in range(human.intron_num()):
    tag = f"{i + 1:03d}"  # 1-based, zero-padded to match the FASTA file names
    cmd = f"blastn -query ../data/blast/fasta/ttn_intron{tag}.fasta -db ../data/blast/db/ttn_intron -out ../data/blast/result/ttn_intron{tag}.txt -outfmt {fmt}"
    # check=True raises CalledProcessError if blastn fails for this query
    # instead of silently leaving a missing or partial result file behind.
    subprocess.run(cmd.split(), check=True)

## BLASTの結果の整理

In [21]:
# List of BLAST result files, ordered numerically by intron number.
def _intron_number(path):
    """Return, as an int, the text after the last "n" of the file name (up to its first ".")."""
    stem = path.rsplit("/", 1)[-1].partition(".")[0]
    return int(stem.rpartition("n")[2])


ls = sorted(glob.glob("../data/blast/result/ttn_intron*.txt"), key=_intron_number)

In [22]:
# Read every BLAST result file (tab-separated, no header row) and combine them
# into one table.
columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
frames = []
for path in ls:
    try:
        frames.append(pd.read_csv(path, sep='\t', header=None))
    except pd.errors.EmptyDataError:
        # An empty result file (presumably a query with no reported hits)
        # contributes no rows; skip it instead of aborting the whole load.
        continue

if frames:
    # Concatenate once rather than inside the loop, which would recopy the
    # growing table on every iteration. Each file keeps its own 0-based row
    # index, so index values repeat across queries.
    df = pd.concat(frames)
    df.columns = columns
else:
    # No result files, or only empty ones: an empty table with the right schema.
    df = pd.DataFrame(columns=columns)
df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,intron001,intron001,100.000,2556,0,0,1,2556,1,2556,0.000000e+00,4721.0
1,intron001,intron294,85.666,293,40,2,1848,2139,1551,1260,1.010000e-84,307.0
2,intron001,intron127,84.444,315,44,4,1838,2151,2002,1692,3.640000e-84,305.0
3,intron001,intron049,81.699,306,46,8,1839,2141,791,493,2.240000e-66,246.0
0,intron002,intron002,100.000,2210,0,0,1,2210,1,2210,0.000000e+00,4082.0
...,...,...,...,...,...,...,...,...,...,...,...,...
0,intron358,intron358,100.000,124,0,0,1,124,1,124,9.690000e-63,230.0
0,intron359,intron359,100.000,740,0,0,1,740,1,740,0.000000e+00,1367.0
0,intron360,intron360,100.000,100,0,0,1,100,1,100,1.650000e-49,185.0
0,intron361,intron361,100.000,525,0,0,1,525,1,525,0.000000e+00,970.0


In [23]:
# Save the combined BLAST hit table as CSV.
blast_csv_path = '../data/blast/blast.csv'
df.to_csv(blast_csv_path)