タクソノミーIDが見つからなかったレコードの処理

In [None]:
import xml.etree.ElementTree as ET
import os

# Every path is anchored at the current working directory, so the result
# depends on where the script is launched from, not on where it is stored.
current_dir = os.getcwd()

# Inputs that must already exist
aka_taxids_file = os.path.join(current_dir, 'Taxids_not_found')
xml_file = os.path.join(current_dir, 'Taxids_not_found.xml')
linking_table_file = os.path.join(current_dir, 'Taxids_taxonomic_lineages_linking_table')
accession_numbers_file = os.path.join(current_dir, 'AccessionNumbers_taxids_linking_table_final')

# Files produced by this script
output_table_file = os.path.join(current_dir, 'Taxid_lineage_output')
final_output_file = os.path.join(current_dir, 'Updated_Taxids_taxonomic_lineages_final')
updated_accession_numbers_file = os.path.join(
    current_dir, 'Updated_AccessionNumbers_taxids_linking_table_final'
)

# Stop at the first required input that is missing
files_to_check = [
    aka_taxids_file, xml_file, linking_table_file, accession_numbers_file
]
missing_file = next(
    (path for path in files_to_check if not os.path.exists(path)), None
)
if missing_file is not None:
    print(f"Error: File '{missing_file}' not found.")
    exit(1)

# Load the AkaTaxIds to look up: one ID per line, surrounding whitespace removed
try:
    with open(aka_taxids_file, 'r', encoding='utf-8') as id_list:
        aka_tax_ids = set(map(str.strip, id_list))
except Exception as e:
    print(f"Error reading '{aka_taxids_file}':", e)
    exit(1)

# Maps every AkaTaxId listed under a taxon to that taxon's TaxId
aka_to_taxid_dict = {}

# Ranks written to the lineage column, in output order.
# NOTE(review): newer NCBI Taxonomy data may label the top rank 'domain'
# rather than 'superkingdom' - confirm against the XML actually used.
required_ranks = ['genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom']

# Parse the taxon XML and write one "TaxId<TAB>name<TAB>lineage" row for each
# taxon that lists at least one of the requested AkaTaxIds.
try:
    # Parse before opening the output so that a malformed XML file does not
    # truncate an output table left over from an earlier run.
    tree = ET.parse(xml_file)
    root = tree.getroot()

    with open(output_table_file, 'w', encoding='utf-8') as out_file:
        for taxon in root.findall('Taxon'):
            tax_id_elem = taxon.find('TaxId')
            tax_id = tax_id_elem.text if tax_id_elem is not None else 'No TaxId found'
            name_elem = taxon.find('ScientificName')
            scientific_name = name_elem.text if name_elem is not None else 'No ScientificName found'

            # Compare against None explicitly: truth-testing an Element checks
            # whether it has children and is deprecated.
            aka_elem = taxon.find('AkaTaxIds')
            if aka_elem is None:
                aka_ids = set()
            else:
                aka_ids = {elem.text for elem in aka_elem.findall('TaxId')}
            for aka_id in aka_ids:
                aka_to_taxid_dict[aka_id] = tax_id

            # Only taxa carrying one of the requested AkaTaxIds are written out
            if not aka_tax_ids.intersection(aka_ids):
                continue
            lineage_ex = taxon.find('LineageEx')
            if lineage_ex is None:
                continue

            # Collect rank -> name, skipping ancestors with a missing or empty
            # Rank/ScientificName instead of aborting the whole run.
            lineage_dict = {}
            for ancestor in lineage_ex.findall('Taxon'):
                rank = ancestor.findtext('Rank')
                ancestor_name = ancestor.findtext('ScientificName')
                if rank and ancestor_name:
                    lineage_dict[rank.lower()] = ancestor_name
            filtered_lineage = [lineage_dict.get(rank, '') for rank in required_ranks]

            output_line = f"{tax_id}\t{scientific_name}\t{'|'.join(filtered_lineage)}\n"
            out_file.write(output_line)
except Exception as e:
    print(f"Error processing '{xml_file}':", e)
    exit(1)

# Read the lineage table back in, keyed by TaxId; the value is the
# "scientific name<TAB>lineage" remainder of the row.
taxid_lineage_dict = {}
try:
    with open(output_table_file, 'r', encoding='utf-8') as lineage_table:
        for row in lineage_table:
            fields = row.strip().split('\t')
            tax_id = fields[0]
            taxid_lineage_dict[tax_id] = '\t'.join((fields[1], fields[2]))
except Exception as e:
    print(f"Error reading '{output_table_file}':", e)
    exit(1)

# Logs of the replacements performed, printed in the final summary
updated_aka_taxid_records = []
updated_accession_records = []

# Copy the linking table to the final output, swapping in the current TaxId
# row wherever a row is keyed by a known AkaTaxId.
try:
    with open(linking_table_file, 'r', encoding='utf-8') as source, \
         open(final_output_file, 'w', encoding='utf-8') as output:
        for raw_line in source:
            record = raw_line.strip()
            aka_tax_id = record.split('\t')[0]

            # Pass the row through unchanged unless its key is a known
            # AkaTaxId whose TaxId has an entry in the lineage table.
            if (aka_tax_id not in aka_to_taxid_dict
                    or aka_to_taxid_dict[aka_tax_id] not in taxid_lineage_dict):
                output.write(record + '\n')
                continue

            tax_id = aka_to_taxid_dict[aka_tax_id]
            output.write(tax_id + '\t' + taxid_lineage_dict[tax_id] + '\n')
            updated_aka_taxid_records.append(
                f"Updated line: AkaTaxId '{aka_tax_id}' -> TaxId '{tax_id}'"
            )
except Exception as e:
    print(f"Error processing '{linking_table_file}':", e)
    exit(1)

# Copy the accession-number table, replacing any TaxId that is a known
# AkaTaxId with the TaxId it maps to.
try:
    with open(accession_numbers_file, 'r', encoding='utf-8') as source, \
         open(updated_accession_numbers_file, 'w', encoding='utf-8') as out_file:
        for raw_line in source:
            fields = raw_line.strip().split('\t')
            accession_number = fields[0]
            listed_id = fields[1]

            is_aka_id = listed_id in aka_to_taxid_dict
            tax_id = aka_to_taxid_dict[listed_id] if is_aka_id else listed_id
            if is_aka_id:
                updated_accession_records.append(
                    f"Updated TaxId for accession number '{accession_number}': {listed_id} -> {tax_id}"
                )

            # Only the first two columns are written back
            out_file.write(f"{accession_number}\t{tax_id}\n")
except Exception as e:
    print(f"Error processing '{accession_numbers_file}':", e)
    exit(1)

print("Processing complete. Updated file has been created.")

# Print each change log once, under its own heading
for heading, records in (
    ("\nUpdated AkaTaxId records:", updated_aka_taxid_records),
    ("\nUpdated accession records:", updated_accession_records),
):
    print(heading)
    for record in records:
        print(record)




AccessionNumbers_taxids_Taxonomic_lineages_linking_tableの生成

In [None]:

import pandas as pd

# File names, resolved relative to the working directory.
# Input: TaxId -> lineage table
taxonomic_lineages_file = 'Updated_Taxids_taxonomic_lineages_final'
# Input: accession number -> TaxId table
taxids_file = 'Updated_AccessionNumbers_taxids_linking_table_final'
# Output: accession number -> TaxId -> lineage table
output_file = 'AccessionNumbers_taxids_Taxonomic_lineages_linking_table'

# Data-cleansing helper
def clean_lineage_data(file_path):
    """Load a tab-separated lineage table as a two-column DataFrame.

    Each non-blank line is stripped, split on tabs and cut down to its first
    two fields. Blank lines are skipped. A line with only one field is
    reported as problematic and kept with a missing lineage, so the rows
    always fit the two output columns.

    NOTE(review): only the first two fields are kept; if an input row has the
    form "taxid, scientific name, lineage", the 'lineage' column will hold
    the scientific name - confirm this is intended.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A DataFrame with string columns 'taxid' and 'lineage'. If the file
        cannot be read, the error is printed and the rows collected so far
        (possibly none) are returned.
    """
    cleaned_data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Drop the trailing newline and surrounding whitespace
                line = line.strip()
                if not line:
                    # A blank line would otherwise become a row with an
                    # empty taxid.
                    continue
                # Keep at most the first two tab-separated fields
                parts = line.split('\t')[:2]
                if len(parts) != 2:
                    print(f"Problematic line: {line}")
                    # Pad so every row has two fields; a table made only of
                    # short rows would otherwise fail DataFrame construction.
                    parts.append(None)
                cleaned_data.append(parts)
    except Exception as e:
        # Best effort: report the problem and return what was read
        print(f"Error reading {file_path}: {e}")
    return pd.DataFrame(cleaned_data, columns=['taxid', 'lineage'])

# Load the cleaned taxonomic lineage table
taxonomic_lineages_df = clean_lineage_data(taxonomic_lineages_file)
taxonomic_lineages_df['taxid'] = taxonomic_lineages_df['taxid'].astype(str)  # compare taxids as strings

# A left merge emits one output row per matching lineage row, so a taxid that
# appears more than once in the lineage table would duplicate every accession
# pointing at it. Keep exactly one lineage row per taxid.
taxonomic_lineages_df = taxonomic_lineages_df.drop_duplicates()
repeated_taxid = taxonomic_lineages_df['taxid'].duplicated(keep='first')
if repeated_taxid.any():
    print(
        f"Warning: dropped {int(repeated_taxid.sum())} lineage row(s) whose taxid "
        "was already seen with a different lineage; the first occurrence is kept."
    )
    taxonomic_lineages_df = taxonomic_lineages_df[~repeated_taxid]
print(f"Taxonomic lineages:\n{taxonomic_lineages_df.head()}")

# Load accession numbers and their taxids
try:
    # dtype=str keeps the ids exactly as written. Without it, one missing
    # taxid makes pandas infer a float column and ids turn into strings such
    # as '9606.0', which never match the lineage table.
    taxids_df = pd.read_csv(
        taxids_file,
        sep='\t',
        header=None,
        names=['accession_number', 'taxid'],
        dtype=str,
    )
    taxids_df['taxid'] = taxids_df['taxid'].astype(str)  # compare taxids as strings

except Exception as e:
    print(f"Error reading {taxids_file}: {e}")
    exit(1)

# Attach the lineage to each accession; accessions without a match are kept
try:
    merged_df = pd.merge(taxids_df, taxonomic_lineages_df, on='taxid', how='left')

except Exception as e:
    print(f"Error merging data: {e}")
    exit(1)

# Write the result as a headerless tab-separated file
try:
    merged_df.to_csv(output_file, sep='\t', index=False, header=False)
    print(f"Processing complete. Updated file has been created: {output_file}")
except Exception as e:
    print(f"Error writing to {output_file}: {e}")
    exit(1)

