In [1]:
%load_ext autoreload
%autoreload 2

import os

from kasearch import EasySearch, PrepareOASdb, prepare_tiny_oas

#### "local_oas_path" needs to be the path to a local version of OAS

In [2]:
# Root directory of the locally downloaded OAS files; replace this
# placeholder with the real location before running the cells below.
local_oas_path = "/path/to/oas/database/"

---------
# Prepare the OAS database for KA-Search

When new data is added to OAS, re-run this code to create an updated, pre-aligned version of OAS to search.

In [3]:
# Tag shared by every prepared database folder created in this notebook.
# Change it in this one place (e.g. to the date of the OAS snapshot) so the
# three folder names below can never drift out of sync.
oasdb_version_tag = "newdate"

# Output folders for the three prepared databases: all of OAS, the filtered
# ("small") OAS, and the tiny OAS.
prepared_base_oas_folder = f"oasdb_{oasdb_version_tag}"
prepared_small_oas_folder = f"oasdb_small_{oasdb_version_tag}"
prepared_tiny_oas_folder = f"oasdb_tiny_{oasdb_version_tag}"

## All of OAS

- filter_data: Whether a set of cleaning steps will be used to filter the OAS data.
- local_oas_path: Path to local OAS.
- n_jobs: Number of threads to use.

We normally create files with around 5 million sequences each (`data_file_size`). Larger file sizes usually result in faster searches, but also require more RAM.

In [None]:
# Prepare the complete OAS: filter_data is off, so no cleaning steps are
# applied to the data read from local_oas_path.
prepare_oas = PrepareOASdb(
    prepared_base_oas_folder,
    filter_data=False,
    local_oas_path=local_oas_path,
    n_jobs=30,
)

# Run the preparation, requesting a data file size of 5 million sequences.
prepare_oas(data_file_size=5_000_000)

# of unpaired files: 15631


## Small OAS

In [None]:
# Prepare the small OAS: same source data as the full database, but with
# filter_data switched on so the cleaning steps are applied.
prepare_oas = PrepareOASdb(
    prepared_small_oas_folder,
    filter_data=True,
    local_oas_path=local_oas_path,
    n_jobs=30,
)

# Run the preparation, requesting a data file size of 5 million sequences.
prepare_oas(data_file_size=5_000_000)

## Tiny OAS

In [None]:
# Create the tiny OAS database folder.
# NOTE(review): the second argument is the prepared small-OAS folder, so this
# presumably needs the "Small OAS" cell to have been run first - confirm.
prepare_tiny_oas(
    prepared_tiny_oas_folder,
    prepared_small_oas_folder,
)

-------------------

## Quick test of the new data sets

In [9]:
# Amino-acid query sequence passed to EasySearch in the next cell as a quick
# check that the newly prepared database can be searched.
seq = "QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVIWYDGSNQYYADSVKGRFTISRDNSKNTLFLQMHSLRAEDTAVYYCARGLTSGRYGMDVWGQGTTVTVSS"

In [11]:
# Search the tiny prepared database with the query sequence, asking for the
# 5 best matches (keep_best_n).
search_results = EasySearch(
    seq,
    database_path=prepared_tiny_oas_folder,
    keep_best_n=5,
    n_jobs=30,
)

# Leave the result as the last expression so the notebook displays it.
search_results

Unnamed: 0,sequence,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call,d_call,...,BType,Vaccine,Disease,Subject,Longitudinal,Unique sequences,Total sequences,Isotype,Chain,Identity
0,TTCTTAGAAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGG...,H,F,T,F,T,F,T,IGHV3-33*01,IGHD3-10*01,...,Unsorted-B-Cells,,,Subject-HIP3,no,4402426,16554027,IGHM,Heavy,0.950413
1,GGTTTAGAAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGG...,H,F,T,F,T,T,T,IGHV3-33*01,IGHD2-2*01,...,Unsorted-B-Cells,,,Subject-HIP2,no,4462271,27950084,IGHM,Heavy,0.95
2,GCTATAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGG...,H,F,T,F,T,T,T,IGHV3-33*01,IGHD3-10*01,...,Unsorted-B-Cells,,,Subject-HIP3,no,2535194,19627393,IGHM,Heavy,0.95
3,GCCCTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGG...,H,F,T,F,T,T,T,IGHV3-33*01,IGHD1-7*01,...,Unsorted-B-Cells,,,Subject-HIP2,no,4467381,24573380,IGHM,Heavy,0.95
4,ATCATACAAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGG...,H,F,T,F,T,F,T,IGHV3-33*01,IGHD6-19*01,...,Unsorted-B-Cells,,,Subject-HIP3,no,4397843,5342677,IGHM,Heavy,0.95
