Code to generate the total number of hits for each gRNA

In [None]:
import os
from Bio import SeqIO
import pandas as pd

# Project locations: every input and output of this cell lives in <base_dir>/Files
base_dir = "Path/to/LTR2B_gRNA"
gRNA_fasta_path, hg38_fasta_path, output_sam_path = (
    os.path.join(base_dir, "Files", file_name)
    for file_name in ("gRNA.fasta", "hg38.fa", "alignment_gRNA.sam")
)

# Step 3: Parse the SAM file to count the hits for each gRNA
def count_hits(sam_file):
    """Count the mapped alignment records reported for each gRNA in a SAM file.

    Args:
        sam_file: Path to a SAM-format text file.

    Returns:
        dict mapping each query name (QNAME, first column) to the number of
        its alignment records that are mapped. A query that only occurs in
        unmapped records (FLAG bit 0x4 set) is kept with a count of 0 instead
        of being reported as one hit.
    """
    unmapped_bit = 0x4  # SAM FLAG bit meaning "segment unmapped"
    hit_counts = {}
    with open(sam_file, 'r') as f:
        for line in f:
            if line.startswith('@'):  # Skip header lines
                continue
            line = line.rstrip("\r\n")
            if not line.strip():  # Blank lines carry no alignment record
                continue
            fields = line.split("\t")
            gRNA_name = fields[0]  # QNAME field
            # FLAG is the second column; a record whose FLAG cannot be read is
            # still counted as a hit, as it was before the unmapped check.
            flag_text = fields[1] if len(fields) > 1 else ""
            is_unmapped = flag_text.isdigit() and bool(int(flag_text) & unmapped_bit)
            increment = 0 if is_unmapped else 1
            hit_counts[gRNA_name] = hit_counts.get(gRNA_name, 0) + increment
    return hit_counts

# Step 4: Map gRNA names to sequences from the FASTA file
def map_gRNA_names_to_sequences(fasta_file):
    """Build a lookup from FASTA record ID to its sequence as a plain string.

    Records are read with ``SeqIO.parse(fasta_file, "fasta")``. If the same
    ID occurs more than once, the last record read wins.
    """
    return {entry.id: str(entry.seq) for entry in SeqIO.parse(fasta_file, "fasta")}

# The SAM file must already be on disk before anything can be counted
if not os.path.exists(output_sam_path):
    raise FileNotFoundError(
        f"{output_sam_path} does not exist. Ensure you have run the alignment step."
    )

# Per-gRNA hit counts (keyed by name), then the name -> sequence lookup
gRNA_hits = count_hits(output_sam_path)
gRNA_sequences = map_gRNA_names_to_sequences(gRNA_fasta_path)

# Re-key the counts by sequence; names without a FASTA record are left out
gRNA_hits_with_sequences = {
    gRNA_sequences[name]: count
    for name, count in gRNA_hits.items()
    if name in gRNA_sequences
}

# Step 5: Tabulate the counts, highest hit count first
gRNA_hits_df = pd.DataFrame(
    list(gRNA_hits_with_sequences.items()),
    columns=['gRNA Sequence', 'Hit Count'],
).sort_values(by='Hit Count', ascending=False)

# Show the table
print(gRNA_hits_df)

# Write the table next to the other project files
gRNA_hits_df.to_csv(
    os.path.join(base_dir, "Files", 'gRNA_hits_summary.csv'),
    index=False,
)

print("gRNA hits summary saved to gRNA_hits_summary.csv")


First, remove duplicate entries from alignment_match_counts.txt; then compare the de-duplicated match counts against the overall hit counts to get the off-target percentage.

In [12]:
import pandas as pd

# Input and output locations for the de-duplication step
file_path = 'C:/Users/Bisha/Desktop/LTR2B_gRNA/Files/alignment_match_counts.txt'
cleaned_file_path = 'C:/Users/Bisha/Desktop/LTR2B_gRNA/Files/cleaned_alignment_match_counts.txt'

# Read the headerless, tab-separated table as (Sequence, Matches) rows
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['Sequence', 'Matches'],
)

# Keep a single copy of every fully identical row
cleaned_data = data.drop_duplicates()

# Write the result back out in the same headerless, tab-separated layout
cleaned_data.to_csv(
    cleaned_file_path,
    sep='\t',
    index=False,
    header=False,
)

print(f"Cleaned data saved to {cleaned_file_path}")


Cleaned data saved to C:/Users/Bisha/Desktop/LTR2B_gRNA/Files/cleaned_alignment_match_counts.txt
