# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from tqdm import tqdm_notebook, tqdm
from collections import defaultdict
import numpy as np
from caduceus.Sparse_vector.sparse_vector import SparseVector
import os
from joblib import load, dump, Parallel, delayed
from torch.utils import data

In [4]:
import matplotlib.pyplot as plt

from tqdm import trange

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Preparation & Dataset

In [8]:
chrom_names = [f'chr{i}' for i in list(range(1, 23)) + ['X', 'Y','M']]

def chrom_reader(chrom):
    files = sorted([i for i in os.listdir(f'./caduceus/z_dna/hg38_dna/') if f"{chrom}_" in i])
    return ''.join([load(f"./caduceus/z_dna/hg38_dna/{file}") for file in files])

DNA = {chrom:chrom_reader(chrom) for chrom in tqdm(chrom_names)}
lens_of_chroms = {chrom: len(DNA[chrom]) for chrom in DNA}
ZDNA = load('./caduceus/ZDNA_cousine.pkl')

In [10]:
class Dataset(data.Dataset):
    def __init__(self, chroms,
                 dna_source,
                 labels_source, intervals, lrp_feat=[]):
        self.chroms = chroms
        #self.features = features
        self.dna_source = dna_source
        #self.features_source = features_source
        self.labels_source = labels_source
        self.intervals = intervals
        self.le = LabelBinarizer().fit(np.array([["A"], ["C"], ["T"], ["G"]]))
        self.lrp_feat = lrp_feat

    def __len__(self):
        return len(self.intervals)

    def __getitem__(self, index):
        interval = self.intervals[index]
        chrom = interval[0]
        begin = int(interval[1])
        end = int(interval[2])
        dna_OHE = self.le.transform(list(self.dna_source[chrom][begin:end].upper()))
        
        dna_letters = list(self.dna_source[chrom][begin:end].upper())

        #X = dna_OHE.astype(np.float32)
        X = "".join(dna_letters)
        y = self.labels_source[interval[0]][interval[1]: interval[2]]
        if len(self.lrp_feat) > 0:
            X = X[:,np.sort(self.lrp_feat)]

        return (X, y)

  0%|                                                    | 0/25 [01:26<?, ?it/s]


In [12]:
from sklearn.model_selection import train_test_split

equalized = load('./caduceus/hg38_kouzine_intervals_512-yana.pkl')
equalized = [[inter[0], int(inter[1]), int(inter[2])] for inter in equalized]

train_intervals, test_intervals = train_test_split(equalized, test_size=0.2, shuffle=True) 
train_intervals, val_intervals = train_test_split(train_intervals, test_size=0.3, shuffle=True)

In [46]:
np.random.seed(42)
params = {'batch_size':32,#32,
          'num_workers':4,#16,
          'shuffle':True}


train_dataset = Dataset(chrom_names, 
                       DNA, 
                       ZDNA, train_intervals, lrp_feat = [])

test_dataset = Dataset(chrom_names, 
                       DNA, 
                       ZDNA, test_intervals, lrp_feat = [])

loader_train = data.DataLoader(train_dataset, **params)
loader_test = data.DataLoader(test_dataset, **params)

In [46]:
def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## Модель учитель

In [17]:
# zdna
from transformers import AutoModelForMaskedLM, AutoTokenizer
# from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification

def load_model(checkpoint=None):
    model_name = "kuleshov-group/caduceus-ph_seqlen-1k_d_model-256_n_layer-4_lr-8e-3"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    
    model = AutoModelForMaskedLM.from_pretrained(model_name, 
                                                        trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    model.lm_head = nn.Sequential(
        nn.Linear(in_features=256, out_features=2, bias=False)
    )
    
    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint, weights_only=True))
        model.eval()
    return model, tokenizer

In [18]:
teacher, tokenizer = load_model('caduceus/zdna_2exp_5ep_0.863.pt')

The repository for kuleshov-group/caduceus-ph_seqlen-1k_d_model-256_n_layer-4_lr-8e-3 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/kuleshov-group/caduceus-ph_seqlen-1k_d_model-256_n_layer-4_lr-8e-3.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [19]:
teacher

CaduceusForMaskedLM(
  (caduceus): Caduceus(
    (backbone): CaduceusMixerModel(
      (embeddings): CaduceusEmbeddings(
        (word_embeddings): Embedding(16, 256)
      )
      (layers): ModuleList(
        (0-3): 4 x Block(
          (mixer): BiMambaWrapper(
            (mamba_fwd): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
              (x_proj): Linear(in_features=512, out_features=48, bias=False)
              (dt_proj): Linear(in_features=16, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=256, bias=False)
            )
            (mamba_rev): Mamba(
              (in_proj): Linear(in_features=256, out_features=1024, bias=False)
              (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
              (act): SiLU()
            

In [20]:
model_parameters = filter(lambda p: p.requires_grad, teacher.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
params

1935104

## Модель ученик

In [21]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
import sys

from mamba_lrp.model.mamba_huggingface import ModifiedMambaForCausalLM
from mamba_lrp.model.utils import *
from mamba_lrp.lrp.utils import relevance_propagation
from mamba_lrp.dataset.general_dataset import get_sst_dataset
import torch

In [150]:
from hf_mamba_classification import MambaForSequenceClassification # https://github.com/getorca/mamba_for_sequence_classification/blob/main/src/hf_mamba_classification.py

model_path = 'state-spaces/mamba-130m-hf'

student = MambaForCausalLM.from_pretrained(
    model_path, 
    num_labels=2, 
    use_cache=True,
    intermediate_size=512,
    hidden_size=256,
    num_hidden_layers=4,
    ignore_mismatched_sizes=True
)
resize_token_embeddings(student, len(tokenizer))
student.lm_head = torch.nn.Linear(256, 2, bias=True)
student.load_state_dict(torch.load('mamba-gue-for-distil.pt'), strict=False)

Some weights of the model checkpoint at state-spaces/mamba-130m-hf were not used when initializing MambaForCausalLM: ['backbone.layers.10.mixer.A_log', 'backbone.layers.10.mixer.D', 'backbone.layers.10.mixer.conv1d.bias', 'backbone.layers.10.mixer.conv1d.weight', 'backbone.layers.10.mixer.dt_proj.bias', 'backbone.layers.10.mixer.dt_proj.weight', 'backbone.layers.10.mixer.in_proj.weight', 'backbone.layers.10.mixer.out_proj.weight', 'backbone.layers.10.mixer.x_proj.weight', 'backbone.layers.10.norm.weight', 'backbone.layers.11.mixer.A_log', 'backbone.layers.11.mixer.D', 'backbone.layers.11.mixer.conv1d.bias', 'backbone.layers.11.mixer.conv1d.weight', 'backbone.layers.11.mixer.dt_proj.bias', 'backbone.layers.11.mixer.dt_proj.weight', 'backbone.layers.11.mixer.in_proj.weight', 'backbone.layers.11.mixer.out_proj.weight', 'backbone.layers.11.mixer.x_proj.weight', 'backbone.layers.11.norm.weight', 'backbone.layers.12.mixer.A_log', 'backbone.layers.12.mixer.D', 'backbone.layers.12.mixer.conv1d

<All keys matched successfully>

In [24]:
student

MambaForCausalLM(
  (backbone): MambaModel(
    (embeddings): Embedding(12, 256)
    (layers): ModuleList(
      (0-3): 4 x MambaBlock(
        (norm): MambaRMSNorm(256, eps=1e-05)
        (mixer): MambaMixer(
          (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
          (act): SiLU()
          (in_proj): Linear(in_features=256, out_features=1024, bias=False)
          (x_proj): Linear(in_features=512, out_features=80, bias=False)
          (dt_proj): Linear(in_features=48, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=256, bias=False)
        )
      )
    )
    (norm_f): MambaRMSNorm(256, eps=1e-05)
  )
  (lm_head): Linear(in_features=256, out_features=2, bias=True)
)

In [25]:
model_parameters = filter(lambda p: p.requires_grad, student.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
params

1886978

In [151]:
student.cuda()
teacher.cuda()

MambaForCausalLM(
  (backbone): MambaModel(
    (embeddings): Embedding(12, 256)
    (layers): ModuleList(
      (0-3): 4 x MambaBlock(
        (norm): MambaRMSNorm(256, eps=1e-05)
        (mixer): MambaMixer(
          (conv1d): Conv1d(512, 512, kernel_size=(4,), stride=(1,), padding=(3,), groups=512)
          (act): SiLU()
          (in_proj): Linear(in_features=256, out_features=1024, bias=False)
          (x_proj): Linear(in_features=512, out_features=80, bias=False)
          (dt_proj): Linear(in_features=48, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=256, bias=False)
        )
      )
    )
    (norm_f): MambaRMSNorm(256, eps=1e-05)
  )
  (lm_head): Linear(in_features=256, out_features=2, bias=True)
)

In [32]:
with torch.no_grad():
    outputs = teacher(**tokenizer(train_dataset[0][0], return_tensors='pt', add_special_tokens=False).to(device))
outputs.logits.shape

torch.Size([1, 512, 2])

In [64]:
with torch.no_grad():
    outputs = student(**tokenizer(train_dataset[0][0], return_tensors='pt', add_special_tokens=False).to(device))
outputs.logits

tensor([[[-0.9602, -0.0840],
         [ 0.2702, -0.0568],
         [-0.0995, -0.5179],
         ...,
         [ 0.1558,  0.1106],
         [-0.0428, -1.2408],
         [ 0.2647, -0.6359]]], device='cuda:0')

## Train

In [152]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score

def compute_metrics(predictions, labels):
    predictions = np.argmax(predictions[0], axis=-1)
    return {'accuracy': accuracy_score(labels, predictions),
            'mcc': matthews_corrcoef(labels, predictions),
            'f1': f1_score(labels, predictions)
           }

def eval(n=500):
    student.eval()
    mcc = []
    acc = []
    f1 = []
    i = 0
    for batch in tqdm(test_dataset):
        if batch[1].sum() == 0:
            continue
        if i == n:
            break
        inputs = torch.Tensor(tokenizer(batch[0], return_tensors='pt', add_special_tokens=False)['input_ids']).to(device)
        labels = torch.Tensor(batch[1]).to(device)
        
        # Получаем логиты от обеих моделей
        with torch.no_grad():
            student_logits = student(inputs).logits
    
        metrics = compute_metrics(student_logits.cpu(), labels.cpu())
        mcc.append(metrics['mcc'])
        acc.append(metrics['accuracy'])
        f1.append(metrics['f1'])
        i += 1
    
    print(np.mean(mcc), np.mean(f1), np.mean(acc))
    return np.mean(mcc), np.mean(f1), np.mean(acc)
    
# class DistillationLoss(nn.Module):
#     def __init__(self, alpha=0.5, temperature=2.0):
#         super(DistillationLoss, self).__init__()
#         self.alpha = alpha  # Вес для стандартной кросс-энтропии
#         self.temperature = temperature  # Температура для дистилляции (упрощает логиты учителя)

#     def forward(self, student_logits, teacher_logits, labels):
#         # Стандартная кросс-энтропия (потеря на реальных метках)
#         ce_loss = F.cross_entropy(student_logits, labels)
        
#         # Потеря дистилляции
#         student_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
#         teacher_probs = F.softmax(teacher_logits / self.temperature, dim=-1)
        
#         distillation_loss = F.kl_div(student_probs, teacher_probs, reduction='batchmean') * (self.temperature ** 2)
        
#         # Общая потеря (комбинированная)
#         loss = self.alpha * ce_loss + (1.0 - self.alpha) * distillation_loss
#         return loss

class DistillationLoss(nn.Module):
    def __init__(self, alpha=0.5, temperature=2.0):
        super(DistillationLoss, self).__init__()
        self.alpha = alpha  # Вес для стандартной кросс-энтропии
        self.temperature = temperature  # Температура для дистилляции (упрощает логиты учителя)

    def forward(self, student_logits, teacher_logits, labels):
        # Преобразуем logits в нужную форму
        # student_logits: (batch_size, seq_len, num_classes)
        # labels: (batch_size, seq_len)
        
        # Стандартная кросс-энтропия (потеря на реальных метках)
        # Мы учитываем, что метки имеют форму (batch_size, seq_len), и каждая метка — это индекс класса для каждого токена
        ce_loss = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)), labels.view(-1), reduction='mean')
        
        # Потеря дистилляции
        student_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
        teacher_probs = F.softmax(teacher_logits / self.temperature, dim=-1)
        
        # KL-дивергенция между логитами ученика и учителя (по всем токенам и всем классам)
        distillation_loss = F.kl_div(student_probs.view(-1, student_probs.size(-1)), teacher_probs.view(-1, teacher_probs.size(-1)), reduction='mean') * (self.temperature ** 2)
        
        # Общая потеря (комбинированная)
        loss = self.alpha * ce_loss + (1.0 - self.alpha) * distillation_loss
        return loss


def train_student_model(student_model, teacher_model, train_loader, optimizer, device, num_epochs=10, alpha=0.5, temperature=2.0):
    distillation_loss_fn = DistillationLoss(alpha=alpha, temperature=temperature)
    best_f1 = 0
    
    # Переводим модели в режим тренировки
    student_model.train()
    teacher_model.eval()  # Модель учителя в режиме оценки
    
    for epoch in range(num_epochs):
        gc.collect()
        torch.cuda.empty_cache() 
        
        total_loss = 0.0
        correct = 0
        total = 0
        
        for batch in tqdm(train_loader):
            inputs = tokenizer(batch[0], return_tensors='pt', add_special_tokens=False).to(device)
            labels = batch[1].to(device)
            
            # Получаем логиты от обеих моделей
            with torch.no_grad():  # Учитель не обучается, только инференс
                teacher_logits = teacher_model(**inputs).logits
            
            student_logits = student_model(**inputs).logits
            # print(teacher_logits.shape, student_logits.shape, labels.shape)
            # print(student_logits)
            # print(compute_metrics(student_logits.cpu().detach().numpy(), labels.type(torch.LongTensor).cpu()))
                  
            # Вычисляем потерю
            student_logits = student_logits.view(-1, student_logits.size(-1)).to(device)
            labels = labels.view(-1).type(torch.LongTensor).to(device)
            # ce_loss = F.cross_entropy(student_logits, labels, reduction='mean')
            
            loss = distillation_loss_fn(student_logits, teacher_logits, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            # Оценка точности
            _, predicted = torch.max(student_logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        epoch_loss = total_loss / len(train_loader)
        mcc, f1, acc = eval()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, F1: {f1:.2f}, MCC: {mcc:.2f}, Accuracy: {acc:.2f}')
        if best_f1 < f1:
            best_f1 = f1
            torch.save(student_model.state_dict(), f'zdna_f1_{best_f1}.pt')


# Пример использования
# Предположим, что у нас есть модели student_model и teacher_model, а также train_loader с данными

# optimizer = optim.Adam(student.parameters(), lr=1e-4)
optimizer = torch.optim.Adam(student.parameters(), lr=1e-3, betas=(0.95, 0.9))

# Устройство (GPU или CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Переносим модели на устройство
student.to(device)
teacher.to(device)

# Обучение модели ученика
train_student_model(student, teacher, loader_test, optimizer, device, num_epochs=5)

100%|█████████████████████████████████████████| 753/753 [01:56<00:00,  6.47it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:53, 417.58it/s]


0.5009487227569068 0.4996799522040411 0.95715234375
Epoch [1/5], Loss: 0.0471, F1: 0.50, MCC: 0.50, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:02<00:00,  6.16it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:55, 400.48it/s]


0.47344931111990574 0.43772663388393235 0.960171875
Epoch [2/5], Loss: 0.0412, F1: 0.44, MCC: 0.47, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:03<00:00,  6.08it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:57, 383.66it/s]


0.43267865028194497 0.3939089441161382 0.95799609375
Epoch [3/5], Loss: 0.0401, F1: 0.39, MCC: 0.43, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:07<00:00,  5.92it/s]
  8%|██▊                                  | 1858/24071 [00:05<01:03, 352.54it/s]


0.5057119381368708 0.47586584211463184 0.9613125
Epoch [4/5], Loss: 0.0395, F1: 0.48, MCC: 0.51, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:08<00:00,  5.88it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:59, 373.90it/s]

0.480924048030009 0.4468616728679255 0.96073828125
Epoch [5/5], Loss: 0.0389, F1: 0.45, MCC: 0.48, Accuracy: 0.96





In [155]:
# Пример использования
# Предположим, что у нас есть модели student_model и teacher_model, а также train_loader с данными

# optimizer = optim.Adam(student.parameters(), lr=1e-4)
optimizer = torch.optim.Adam(student.parameters(), lr=2e-4, betas=(0.95, 0.9))

# Устройство (GPU или CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Переносим модели на устройство
student.to(device)
teacher.to(device)

# Обучение модели ученика
train_student_model(student, teacher, loader_test, optimizer, device, num_epochs=3)

100%|█████████████████████████████████████████| 753/753 [01:59<00:00,  6.28it/s]
  8%|██▊                                  | 1858/24071 [00:05<01:02, 356.78it/s]


Epoch [1/3], Loss: 0.0371, F1: 0.52, MCC: 0.55, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:06<00:00,  5.97it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:56, 391.99it/s]


Epoch [2/3], Loss: 0.0366, F1: 0.52, MCC: 0.55, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:09<00:00,  5.82it/s]
  8%|██▊                                  | 1858/24071 [00:05<01:00, 365.50it/s]

Epoch [3/3], Loss: 0.0363, F1: 0.52, MCC: 0.55, Accuracy: 0.96





In [158]:
student.load_state_dict(torch.load('zdna_f1_0.523.pt'))

<All keys matched successfully>

In [160]:
optimizer = torch.optim.Adam(student.parameters(), lr=2e-4, betas=(0.95, 0.9))

# Обучение модели ученика
# alpha=0.85
train_student_model(student, teacher, loader_test, optimizer, device, num_epochs=3)

100%|█████████████████████████████████████████| 753/753 [01:57<00:00,  6.41it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:57, 386.09it/s]


Epoch [1/3], Loss: 0.0367, F1: 0.51, MCC: 0.54, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:03<00:00,  6.09it/s]
  8%|██▊                                  | 1858/24071 [00:05<01:00, 366.04it/s]


Epoch [2/3], Loss: 0.0363, F1: 0.53, MCC: 0.55, Accuracy: 0.96


100%|█████████████████████████████████████████| 753/753 [02:05<00:00,  5.99it/s]
  8%|██▊                                  | 1858/24071 [00:04<00:59, 375.33it/s]

Epoch [3/3], Loss: 0.0359, F1: 0.54, MCC: 0.56, Accuracy: 0.96





### Eval

In [147]:
eval()

  8%|██▊                                  | 1858/24071 [00:04<00:54, 409.06it/s]

0.5118393718357392 0.4904617857087976 0.961109375





(0.5118393718357392, 0.4904617857087976, 0.961109375)

# Interpret student LRP

In [82]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
import sys

from mamba_lrp.model.mamba_huggingface import ModifiedMambaForCausalLM
from mamba_lrp.model.utils import *
from mamba_lrp.lrp.utils import relevance_propagation
from mamba_lrp.dataset.general_dataset import get_sst_dataset
import torch

In [83]:
modified_model = ModifiedMambaForCausalLM(student, is_fast_forward_available=False)
modified_model.eval()
pretrained_embeddings = student.backbone.embeddings

In [124]:
from captum.attr import visualization as viz

def interpret(seq):
    # print(seq[0])
    inputs = tokenizer(seq[0], return_tensors='pt', add_special_tokens=False)['input_ids']
    # print(inputs)
    inputs = inputs.long().to(device)
    label = torch.tensor(seq[1]).unsqueeze(0).long().to(device)
    
    embeddings = pretrained_embeddings(inputs)
    
    R, prediction = relevance_propagation(
        model=modified_model,
        embeddings=embeddings,
        targets=label,
        n_classes=2
    )
    
    tokens = []
    for id in inputs[0]:
        tokens.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([id.item()])))
    attributions = R[0]
    attributions = attributions / attributions.max()

    # print(attributions.shape)
    # print(student(inputs).logits.shape)
    return attributions
    # Visualize the attributions
    viz.visualize_text([viz.VisualizationDataRecord(
        attributions,
        torch.max(student(inputs).logits, dim=1).values.item(),
        torch.argmax(student(inputs).logits, dim=1).item(),
        true_class=label.item(),
        attr_class=label.item(),
        attr_score=attributions.sum(),
        raw_input_ids=tokens,
        convergence_score=None
    )])
    return attributions


In [125]:
attributions = interpret(test_dataset[0])

In [126]:
attributions

array([ 2.20194284e-04, -2.36713495e-05, -1.27956464e-05,  1.67269609e-05,
        6.60531850e-06, -3.29011018e-05, -2.86573595e-05, -3.00497468e-05,
        1.54890149e-05,  8.10637630e-06,  1.55048201e-05,  4.10308239e-05,
       -2.15316941e-05,  3.26815425e-05, -3.06306247e-05,  2.63177717e-05,
        3.30987386e-05,  2.67635914e-05, -4.08664200e-05,  4.08500782e-05,
        3.92948386e-05,  4.58687828e-05, -1.24772041e-05,  4.25325597e-05,
        5.45747607e-05,  3.50149276e-05, -2.75278453e-05,  4.70651430e-05,
       -2.39144538e-05,  5.84691079e-05, -3.96029754e-05, -1.99875267e-05,
        2.85731512e-05,  4.22936319e-05,  8.68133575e-05,  7.84309086e-05,
        9.10606468e-05, -4.54844158e-05,  7.41585973e-05,  6.49573849e-05,
        5.89480042e-05,  9.48274173e-05,  1.13000133e-04, -1.31915785e-05,
        8.37759435e-05, -9.08606034e-06, -1.26985578e-05, -3.50881710e-05,
        2.30031492e-05, -2.92240529e-05,  3.38749196e-05,  3.69917943e-05,
        4.18177733e-05,  

# IG

In [161]:
seqs = ['GCGCG', 'CACGC', 'CGC', 'GCGC', 'TGCGC', 'GCGTG', 'CACGC']
scores = [0.8230, 0.6916, 0.6564, 0.6036, 0.5462, 0.5042, 0.4842]
counts = [339, 3201, 131, 217, 108, 827, 2121]
pd.DataFrame({'kmer': seqs, 'Score': scores, 'Count': counts})

Unnamed: 0,kmer,Score,Count
0,GCGCG,0.823,339
1,CACGC,0.6916,3201
2,CGC,0.6564,131
3,GCGC,0.6036,217
4,TGCGC,0.5462,108
5,GCGTG,0.5042,827
6,CACGC,0.4842,2121
