# In [None]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict
from scipy.sparse import dok_matrix, csr_matrix
from itertools import product

def generate_kmers(sequence, k):
    """Return the overlapping length-``k`` windows of ``sequence`` in order.

    The window advances one position at a time, so for ``1 <= k <= n`` a
    sequence of length ``n`` produces ``n - k + 1`` k-mers. When ``k`` is
    larger than the sequence, the result is an empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def count_kmers_in_seq(sequence, kmer_list):
    """Count how often each k-mer from ``kmer_list`` occurs in ``sequence``.

    Args:
        sequence: String scanned with overlapping windows.
        kmer_list: Collection of k-mers to count. All entries are assumed to
            have the same length; the window size is taken from an arbitrary
            entry of the collection.

    Returns:
        A ``defaultdict(int)`` mapping every listed k-mer that occurs at least
        once in ``sequence`` to its number of occurrences. The mapping is
        empty when ``kmer_list`` is empty.
    """
    kmer_counts = defaultdict(int)

    # Hash-based containers already offer constant-time membership tests;
    # anything else (list, tuple, ...) is converted once so the loop below
    # does not perform a linear scan for every window.
    if isinstance(kmer_list, (set, frozenset, dict)):
        allowed = kmer_list
    else:
        allowed = set(kmer_list)

    # Without any k-mer there is no window size to derive and nothing to
    # count; previously this case raised StopIteration.
    if not allowed:
        return kmer_counts

    k = len(next(iter(allowed)))
    for kmer in generate_kmers(sequence, k):
        if kmer in allowed:
            kmer_counts[kmer] += 1
    return kmer_counts

def get_all_possible_kmers(k):
    """Return the set of every string of length ``k`` over the letters A, C, G, T.

    The result contains ``4 ** k`` distinct strings, so it grows quickly
    with ``k``.
    """
    letter_tuples = product('ACGT', repeat=k)
    return set(map(''.join, letter_tuples))

def create_kmer_dataframe(fasta_file, kmer_list):
    """Build a sparse table of k-mer counts with one row per FASTA record.

    Args:
        fasta_file: FASTA input passed to ``SeqIO.parse``. It is read exactly
            once, so an already-open handle works as well as a path.
        kmer_list: Collection of k-mers to count; each one becomes a column.

    Returns:
        A pandas DataFrame with sparse columns, indexed by record id, with one
        column per k-mer in sorted order. Each cell holds the number of times
        that k-mer occurs in the upper-cased record sequence.
    """
    sorted_kmers = sorted(kmer_list)
    kmer_to_index = {kmer: idx for idx, kmer in enumerate(sorted_kmers)}

    headers = []
    # Non-zero entries are collected as (row, column, value) triplets so the
    # number of records does not have to be known before the matrix is built.
    row_indices = []
    col_indices = []
    counts = []

    for seq_idx, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
        headers.append(record.id)
        sequence = str(record.seq).upper()
        # The index dict has the same keys as kmer_list and gives hash-based
        # membership tests even when kmer_list itself is a list.
        kmer_counts = count_kmers_in_seq(sequence, kmer_to_index)
        for kmer, count in kmer_counts.items():
            # Only k-mers from kmer_list are ever counted, so the lookup
            # cannot miss.
            row_indices.append(seq_idx)
            col_indices.append(kmer_to_index[kmer])
            counts.append(count)

    data = csr_matrix(
        (counts, (row_indices, col_indices)),
        shape=(len(headers), len(sorted_kmers)),
        dtype=int,
    )
    return pd.DataFrame.sparse.from_spmatrix(
        data, index=headers, columns=sorted_kmers
    )

def main(promoter_fasta, non_promoter_fasta, k):
    """Create k-mer count tables for the promoter and non-promoter FASTA inputs.

    Both tables share the same columns: every possible k-mer of length ``k``.

    Returns:
        A 2-tuple ``(promoter_df, non_promoter_df)``.
    """
    vocabulary = get_all_possible_kmers(k)
    tables = tuple(
        create_kmer_dataframe(fasta_path, vocabulary)
        for fasta_path in (promoter_fasta, non_promoter_fasta)
    )
    return tables

# Input locations: replace both placeholder strings with real FASTA paths
# before running.
promoter_fasta = 'insert_path_to_your_promoter_fasta_file'
non_promoter_fasta = 'insert_path_to_your_non-promoter_fasta_file'
k = 10  # k-mer length; each table gets one column per possible k-mer (4**k)

# Build one k-mer count table per input file.
promoter_df, non_promoter_df = main(promoter_fasta, non_promoter_fasta, k)

# Display both tables.
print(promoter_df)
print(non_promoter_df)
