In [5]:
import glob
import pandas as pd
import re
import pyarrow.parquet as pq
import zstandard

from sklearn.model_selection import train_test_split

## csvsディレクトリ直下のcsvファイルを8:2に分割してfasta形式の検証データに整形

In [29]:
families = list(map(lambda file_name:  file_name.replace('./Data/csvs/', '').replace('.csv', ''), glob.glob("./Data/csvs/*")))
families

['tRNA',
 'SpF59_sRNA',
 'tmRNA',
 'Dicty_Class_I_RNA',
 'IRES_Picorna',
 '5_8S_rRNA',
 'Y_RNA',
 '5S_rRNA',
 'alpha_tmRNA']

In [39]:
for family in families:
    df = pd.read_csv(f'./Data/csvs/{family}.csv', encoding = 'UTF-8')
    df['SerialNum'] = '>' + df['id'].astype(str)
    df = df.loc[:,['SerialNum','sequence']]
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
    df_train.to_csv(f'./Data/fastas/{family}_train.fasta', sep = '\n', index = None,  header = None)
    df_test.to_csv(f'./Data/fastas/{family}_test.fasta', sep = '\n', index = None,  header = None)

## RT988_demoをテストデータに変換

In [3]:
df = pd.read_csv('./Data/csvs/RT988_demo.csv', encoding = 'UTF-8')
df = df.sample(n=1250, random_state=0)
df['SerialNum'] = '>' + df['id'].astype(str)
df = df.loc[:,['SerialNum','sequence']]
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
df_train.to_csv('./Data/fastas/RT988_demo_train.fasta', sep = '\n', index = None,  header = None)
df_test.to_csv('./Data/fastas/RT988_demo_test.fasta', sep = '\n', index = None,  header = None)

In [16]:
files = [
    'Rfam_100_200',
    'Rfam_100',
    'Rfam_200_300',
    'Rfam_300_400',
    'Rfam_400_1600',
]

for file in files:
    df = pd.read_csv(f'./Data/csvs/{file}.csv', encoding = 'UTF-8')
    # df = df.sample(n=1250, random_state=0)
    df['SerialNum'] = '>' + df['id'].astype(str)
    df = df.loc[:,['SerialNum','sequence']]
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
    df_train.to_csv(f'./Data/fastas/{file}_train.fasta', sep = '\n', index = None,  header = None)
    df_test.to_csv(f'./Data/fastas/{file}_test.fasta', sep = '\n', index = None,  header = None)

# rfam全体のデータセット

## csvをparquet形式に圧縮

In [6]:
input_path = './Data/csvs/rfam.csv'
output_path = input_path.replace('.csv', '.zstd')

df = pd.read_csv(input_path, encoding='UTF-8')
df.to_parquet(output_path, compression='zstd')

## rfam全体を８：2 にtrain-test split

In [7]:
df = pq.read_table('./Data/csvs/rfam.zstd').to_pandas()
df['SerialNum'] = '>' + df['id'].astype(str)
df = df.loc[:,['SerialNum','sequence']]
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
df_train.to_csv('./Data/fastas/rfam_train.fasta', sep = '\n', index = None,  header = None)
df_test.to_csv('./Data/fastas/rfam_test.fasta', sep = '\n', index = None,  header = None)