## Import libraries

In [1]:
import pandas as pd
import numpy as np
import math
import shutil
import os
import warnings
import csv

## Load data

In [2]:
# Read the forced-alignment / analysis results table (tab-separated) into a DataFrame
probes_results_tsv = (
    "../data_exports/forced_alignments_and_analysis_results_dataframe.tsv"
)
all_probes_results_df = pd.read_csv(probes_results_tsv, sep="\t")

# Preview the first rows as a quick check that the file loaded
print(all_probes_results_df.head())

           run_id        probe_id  \
0  wtchgD00001556  Geno1a_probe_1   
1  wtchgD00001556  Geno1a_probe_2   
2  wtchgD00001556  Geno1a_probe_3   
3  wtchgD00001556  Geno1a_probe_4   
4  wtchgD00001556  Geno1a_probe_5   

                                      probe_sequence  \
0  CCAGCCCCCTGATGGGGGCGACACTCCACCATGAATCACTCCCCTG...   
1  CTTCACGCAGAAAGCGTCTAGCCATGGCGTTAGTATGAGTGTCGTG...   
2  CCCCCCCTCCCGGGAGAGCCATAGTGGTCTGCGGAACCGGTGAGTA...   
3  GGACGACCGGGTCCTTTCTTGGATCAACCCGCTCAATGCCTGGAGA...   
4  CGCAAGACTGCTAGCCGAGTAGTGTTGGGTCGCGAAAGGCCTTGTG...   

                     matched_reference_position_list  \
0  [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...   
1  [62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 7...   
2  [120, 121, 122, 123, 124, 125, 126, 127, 128, ...   
3  [180, 181, 182, 183, 184, 185, 186, 187, 188, ...   
4  [240, 241, 242, 243, 244, 245, 246, 247, 248, ...   

                         reference_msa_position_list  \
0  [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,

## Copy relevant nucleotide frequency files to another folder

In [3]:
# Get unique run_id values
unique_run_ids = all_probes_results_df["run_id"].unique()

print("Unique run_id values:")
for run_id in unique_run_ids:
    print(run_id)

# Define the source and destination directories
source_dir = "../azim_data/Summaries/T5"
destination_dir = "../diversity_calculation"

# Create the destination folder up front; without it every copy below would
# fail and only be reported through the per-file error message.
os.makedirs(destination_dir, exist_ok=True)

# Copy the nucleotide frequency file of each unique run_id
for run_id in unique_run_ids:
    # Source and destination share the same file name, so build it once
    file_name = f"{run_id}_closestdedup_vfat_QA_ntfreq_assem.txt"
    source_file = os.path.join(source_dir, file_name)
    destination_file = os.path.join(destination_dir, file_name)

    if not os.path.exists(source_file):
        print(f"File not found: {source_file}")
        continue

    try:
        shutil.copy2(source_file, destination_file)
    except OSError as e:
        # Filesystem errors (permissions, same-file, disk full, ...) are
        # reported and the loop carries on with the next run.
        print(f"Error copying {source_file}: {e}")
    else:
        print(f"Copied {source_file} to {destination_file}")

print("File copying process completed.")

Unique run_id values:
wtchgD00001556
wtchgD00001562
wtchgD00001561
wtchgD00001576
wtchgD00001577
wtchgD00001567
wtchgD00001558
wtchgD00001555
wtchgD00001560
wtchgD00001575
wtchgD00001559
wtchgD00001557
wtchgD00001571
wtchgD00001583
wtchgD00001582
Copied ../azim_data/Summaries/T5/wtchgD00001556_closestdedup_vfat_QA_ntfreq_assem.txt to ../diversity_calculation/wtchgD00001556_closestdedup_vfat_QA_ntfreq_assem.txt
Copied ../azim_data/Summaries/T5/wtchgD00001562_closestdedup_vfat_QA_ntfreq_assem.txt to ../diversity_calculation/wtchgD00001562_closestdedup_vfat_QA_ntfreq_assem.txt
Copied ../azim_data/Summaries/T5/wtchgD00001561_closestdedup_vfat_QA_ntfreq_assem.txt to ../diversity_calculation/wtchgD00001561_closestdedup_vfat_QA_ntfreq_assem.txt
Copied ../azim_data/Summaries/T5/wtchgD00001576_closestdedup_vfat_QA_ntfreq_assem.txt to ../diversity_calculation/wtchgD00001576_closestdedup_vfat_QA_ntfreq_assem.txt
Copied ../azim_data/Summaries/T5/wtchgD00001577_closestdedup_vfat_QA_ntfreq_assem.txt

## Shannon entropy
I think my Shannon entropy calculation might be flawed: there may be a bug in how it accounts for insertions or deletions, which I don't think I want it to do. However, I decided that I am not interested in Shannon entropy after all, so I won't bother fixing this.
### One example

In [4]:
# Column labels for the per-position nucleotide frequency table
nt_freq_columns = [
    "Pos",
    "ConsensusNt",
    "Read_Depth",
    "FreqA",
    "FreqT",
    "FreqG",
    "FreqC",
    "FreqDel",
    "FreqInsertion",
    "Insertions(Count)",
]

# Location of the example run's nucleotide frequency file
file_path = (
    "../diversity_calculation/wtchgD00001556_closestdedup_vfat_QA_ntfreq_assem.txt"
)

# Parse the file as tab-separated text, skipping its first line, then replace
# the header pandas picked up with the labels defined above
nt_freq_wtchgD00001556 = pd.read_csv(file_path, sep="\t", skiprows=1)
nt_freq_wtchgD00001556.columns = nt_freq_columns

print(nt_freq_wtchgD00001556.head())

   Pos ConsensusNt  Read_Depth  FreqA  FreqT  FreqG  FreqC  FreqDel  \
0    1           C           7    0.0    0.0    0.0    1.0      0.0   
1    2           C           8    0.0    0.0    0.0    1.0      0.0   
2    3           C          10    0.0    0.0    0.0    1.0      0.0   
3    4           T          10    0.0    1.0    0.0    0.0      0.0   
4    5           G          10    0.0    0.0    1.0    0.0      0.0   

  FreqInsertion Insertions(Count)  
0           NaN               NaN  
1           NaN               NaN  
2           NaN               NaN  
3           NaN               NaN  
4           NaN               NaN  


In [5]:
# def calculate_shannon_entropy(row):
#     frequencies = [row["FreqA"], row["FreqT"], row["FreqG"], row["FreqC"]]
#     # Check if all frequencies are either 0 or 1
#     if all(f in [0, 1] for f in frequencies):
#         return 0
#     # Remove zero frequencies to avoid log(0)
#     frequencies = [f for f in frequencies if f > 0]
#     # Calculate entropy
#     entropy = -sum(f * np.log2(f) for f in frequencies)
#     return max(0, entropy)  # Ensure non-negative value


# # Calculate Shannon entropy for each position
# nt_freq_wtchgD00001556["shannon_entropy"] = nt_freq_wtchgD00001556.apply(
#     calculate_shannon_entropy, axis=1
# )

# # Print the first few rows to verify the new column
# print(nt_freq_wtchgD00001556.head())

### Map values back to the original DataFrame (first row only)

In [6]:
# # 1. Get the run_id for the first row
# first_row = all_probes_results_df.iloc[0]
# run_id = first_row["run_id"]

# # 3. Convert Shannon entropy values to a list
# shannon_entropy_list = nt_freq_wtchgD00001556["shannon_entropy"].tolist()
# print(f"Length of Shannon entropy list: {len(shannon_entropy_list)}")

# # 4. Access the numbers in base_reference_read_position_list
# base_positions = eval(first_row["base_reference_read_position_list"])
# print(f"base_reference_read_position_list in first row: {base_positions}")

# # 5. Use those numbers to access the corresponding Shannon entropy values
# shannon_entropy_values = [
#     shannon_entropy_list[pos - 1] if pos is not None and pos > 0 else np.nan
#     for pos in base_positions
# ]
# print(f"corresponding shannon_entropy_values: {shannon_entropy_values}")

# # 6. Add the Shannon entropy values to the DataFrame
# all_probes_results_df["shannon_entropy"] = np.nan  # Initialize the column with NaN
# all_probes_results_df.at[0, "shannon_entropy"] = str(
#     shannon_entropy_values
# )  # Convert to string for storage

# # 7. Check if the lengths match
# base_list_length = len(base_positions)
# shannon_list_length = len(shannon_entropy_values)
# print(f"Length of base_reference_read_position_list: {base_list_length}")
# print(f"Length of shannon_entropy list: {shannon_list_length}")
# print(f"Lengths match: {base_list_length == shannon_list_length}")

## Nucleotide diversity instead of shannon entropy
After coding the above, I decided that actually nucleotide diversity is a better measure.

In [7]:
# Nick shared this function for calculating nucleotide diversity:
def calculate_d_for_pie_diversity(a_count, t_count, c_count, g_count):
    """Compute nucleotide diversity at a single position from per-base counts.

    The result is the sum of the six pairwise products of the base counts
    divided by ``(n / 2) * (n - 1)``, where ``n`` is the total count.

    Slight modification from the function as originally shared: instead of
    raising an error, None is returned when the value cannot be computed.

    Args:
        a_count: Number of A observations.
        t_count: Number of T observations.
        c_count: Number of C observations.
        g_count: Number of G observations.

    Returns:
        The diversity value, or None when the total count is 1 or less.
    """
    n = a_count + c_count + g_count + t_count

    # With fewer than two observations there is no pair to compare. This guard
    # also covers the zero-denominator case: (n / 2) * (n - 1) is only zero
    # for n == 0 or n == 1, so no separate denominator check is needed.
    if n <= 1:
        return None

    sum_nucs = (
        a_count * c_count
        + a_count * t_count
        + a_count * g_count
        + t_count * c_count
        + t_count * g_count
        + c_count * g_count
    )
    denominator = (n / 2.0) * (n - 1)
    return sum_nucs / denominator


# Adapt the above function to work with the data format I have
def apply_nucleotide_diversity(row):
    """Compute nucleotide diversity for one row of a nucleotide frequency table.

    The row must provide "Read_Depth" and the four frequency fields "FreqA",
    "FreqT", "FreqG" and "FreqC". Each frequency is multiplied by the read
    depth and rounded to recover a count, and the counts are handed to
    ``calculate_d_for_pie_diversity``.

    Returns:
        Whatever ``calculate_d_for_pie_diversity`` returns for those counts.
    """
    depth = row["Read_Depth"]
    a_count, t_count, g_count, c_count = (
        round(row[column] * depth)
        for column in ("FreqA", "FreqT", "FreqG", "FreqC")
    )
    return calculate_d_for_pie_diversity(a_count, t_count, c_count, g_count)


# Add a per-position nucleotide diversity column to the example frame
nt_freq_wtchgD00001556["nucleotide_diversity"] = nt_freq_wtchgD00001556.apply(
    apply_nucleotide_diversity, axis=1
)

# Sanity check: redo the calculation by hand for one position and compare it
# with the value stored in the new column.

# Pull out the row for position 139
is_pos_139 = nt_freq_wtchgD00001556["Pos"] == 139
row_139 = nt_freq_wtchgD00001556.loc[is_pos_139].iloc[0]
print("Row data for position 139:")
print(row_139)

# Recover the base counts from the frequencies and the read depth
read_depth = row_139["Read_Depth"]
a_count, t_count, g_count, c_count = (
    round(row_139[column] * read_depth)
    for column in ("FreqA", "FreqT", "FreqG", "FreqC")
)

print("\nStep-by-step calculation:")
print(f"Read depth: {read_depth}")
print(f"A count: {a_count}")
print(f"T count: {t_count}")
print(f"G count: {g_count}")
print(f"C count: {c_count}")

# Total number of base observations at this position
n = sum((a_count, t_count, g_count, c_count))
print(f"\nTotal count (n): {n}")

if n > 1:
    # Numerator: sum of the products of every pair of distinct bases
    pairwise_products = (
        a_count * c_count,
        a_count * t_count,
        a_count * g_count,
        t_count * c_count,
        t_count * g_count,
        c_count * g_count,
    )
    sum_nucs = sum(pairwise_products)
    print(f"Sum of pairwise products: {sum_nucs}")

    # Denominator: (n / 2) * (n - 1)
    denominator = (n / 2.0) * (n - 1)
    print(f"Denominator: {denominator}")

    if denominator == 0:
        print("Nucleotide diversity is None (denominator = 0)")
    else:
        d = sum_nucs / denominator
        print(f"Nucleotide diversity (d): {d}")
else:
    print("Nucleotide diversity is None (n <= 1)")

Row data for position 139:
Pos                          139
ConsensusNt                    G
Read_Depth                    28
FreqA                   0.035714
FreqT                        0.0
FreqG                   0.964286
FreqC                        0.0
FreqDel                      0.0
FreqInsertion                NaN
Insertions(Count)            NaN
nucleotide_diversity    0.071429
Name: 138, dtype: object

Step-by-step calculation:
Read depth: 28
A count: 1
T count: 0
G count: 27
C count: 0

Total count (n): 28
Sum of pairwise products: 27
Denominator: 378.0
Nucleotide diversity (d): 0.07142857142857142


### Working through one example

In [8]:
# 1. Get the run_id for the first row
first_row = all_probes_results_df.iloc[0]
run_id = first_row["run_id"]

# 2. (Re)compute the per-position nucleotide diversity for the example run
nt_freq_wtchgD00001556["nucleotide_diversity"] = nt_freq_wtchgD00001556.apply(
    apply_nucleotide_diversity, axis=1
)

# 3. Convert nucleotide diversity values to a list
nucleotide_diversity_list = nt_freq_wtchgD00001556["nucleotide_diversity"].tolist()
print(f"Length of nucleotide diversity list: {len(nucleotide_diversity_list)}")

# 4. Access the numbers in base_reference_read_position_list
# NOTE(review): eval() executes whatever Python expression the TSV cell holds.
# If this column only ever contains list literals, ast.literal_eval would be a
# safer parser -- confirm before switching.
base_positions = eval(first_row["base_reference_read_position_list"])
print(f"base_reference_read_position_list in first row: {base_positions}")

# 5. Use those numbers to access the corresponding nucleotide diversity values.
#    Positions are used as 1-based offsets into the list; None or out-of-range
#    positions become NaN so the result stays aligned with base_positions.
nucleotide_diversity_values = [
    (
        nucleotide_diversity_list[pos - 1]
        if pos is not None and pos > 0 and pos <= len(nucleotide_diversity_list)
        else np.nan
    )
    for pos in base_positions
]
print(f"corresponding nucleotide_diversity_values: {nucleotide_diversity_values}")

# 6. Add the nucleotide diversity values to the DataFrame.
#    The column holds the list as a string, so it is created with object dtype;
#    writing a str into a float (NaN-initialised) column makes pandas emit an
#    incompatible-dtype FutureWarning.
all_probes_results_df["nucleotide_diversity"] = pd.Series(
    np.nan, index=all_probes_results_df.index, dtype=object
)
all_probes_results_df.at[0, "nucleotide_diversity"] = str(
    nucleotide_diversity_values
)  # Convert to string for storage

# 7. Calculate and add the average nucleotide diversity (NaN entries ignored)
avg_diversity = np.nanmean(nucleotide_diversity_values)
all_probes_results_df["avg_nucleotide_diversity"] = (
    np.nan
)  # Initialize the column with NaN
all_probes_results_df.at[0, "avg_nucleotide_diversity"] = avg_diversity

# 8. Check if the lengths match
base_list_length = len(base_positions)
diversity_list_length = len(nucleotide_diversity_values)
print(f"Length of base_reference_read_position_list: {base_list_length}")
print(f"Length of nucleotide_diversity list: {diversity_list_length}")
print(f"Lengths match: {base_list_length == diversity_list_length}")

Length of nucleotide diversity list: 9378
base_reference_read_position_list in first row: [None, None, None, None, None, None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114]
corresponding nucleotide_diversity_values: [nan, nan, nan, nan, nan, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11764705882352941, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

  all_probes_results_df.at[0, "nucleotide_diversity"] = str(


### Generalising to all rows

In [9]:
# Modifying the original function to work with numpy arrays to make computations more efficient
def calculate_d_for_pie_diversity(a_count, t_count, c_count, g_count):
    """Vectorised nucleotide diversity from per-base counts.

    For every position the result is the sum of the six pairwise products of
    the base counts divided by ``(n / 2) * (n - 1)``, where ``n`` is the total
    count at that position.

    Args:
        a_count, t_count, c_count, g_count: Counts of each base. NumPy arrays
            of matching shape are expected; plain Python numbers are accepted
            too and are treated as 0-d arrays.

    Returns:
        A NumPy array of diversity values (0-d for scalar input), with NaN
        wherever ``n <= 1`` or the denominator is zero.

    Side effects:
        Prints the number of NaN results when there is at least one.
    """
    # Coerce to arrays so Python scalars follow NumPy division semantics
    # (NaN/inf) instead of raising ZeroDivisionError when n is 0 or 1.
    a_count, t_count, c_count, g_count = (
        np.asarray(count) for count in (a_count, t_count, c_count, g_count)
    )

    # Calculate total count of nucleotides
    n = a_count + c_count + g_count + t_count

    # Sum of pairwise products (ac, at, ag, tc, tg, cg)
    sum_nucs = (
        a_count * c_count
        + a_count * t_count
        + a_count * g_count
        + t_count * c_count
        + t_count * g_count
        + c_count * g_count
    )

    # Calculate denominator
    denominator = (n / 2.0) * (n - 1)

    # The division is evaluated for every element, including those that
    # np.where discards, so silence divide-by-zero / invalid-value warnings
    # for this expression only.
    with np.errstate(divide="ignore", invalid="ignore"):
        result = np.where((n > 1) & (denominator != 0), sum_nucs / denominator, np.nan)

    # Log the number of positions where diversity couldn't be calculated
    nan_count = np.isnan(result).sum()
    if nan_count > 0:
        print(
            f"Number of positions where diversity couldn't be calculated: {nan_count}"
        )

    return result


def process_run_id(run_id, diversity_dir, base_positions_dict):
    """Compute per-probe nucleotide diversity for one sequencing run.

    Reads ``{run_id}_closestdedup_vfat_QA_ntfreq_assem.txt`` from
    ``diversity_dir``, derives base counts from the frequency columns and the
    read depth, computes the diversity of every position, and then looks up
    the positions listed for each probe row of this run.

    Args:
        run_id: Identifier used to build the file name and to select the
            entries of ``base_positions_dict``.
        diversity_dir: Directory containing the nucleotide frequency files.
        base_positions_dict: ``{run_id: {row index: [positions]}}``. Positions
            are matched against the file's "Pos" column and may be None.

    Returns:
        A DataFrame with one row per probe row of this run and the columns
        "run_id", "index", "nucleotide_diversity" (string form of a list that
        is aligned element-for-element with the probe's position list, NaN
        where the position is None or absent from the file) and
        "avg_nucleotide_diversity" (mean ignoring NaN; NaN when nothing could
        be looked up). An empty DataFrame is returned when the file is missing.
    """
    file_path = os.path.join(
        diversity_dir, f"{run_id}_closestdedup_vfat_QA_ntfreq_assem.txt"
    )

    if not os.path.exists(file_path):
        print(f"File not found for run_id {run_id}: {file_path}")
        return pd.DataFrame()

    columns = ["Pos", "ConsensusNt", "Read_Depth", "FreqA", "FreqT", "FreqG", "FreqC"]

    # Read the file using csv.reader to handle potential formatting issues.
    # newline="" is what the csv module asks for when given a file object.
    with open(file_path, "r", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        next(reader, None)  # Skip the first line; tolerate an empty file
        data = []
        for row in reader:
            if len(row) >= len(columns):  # Ensure we have the columns we need
                data.append(row[: len(columns)])  # Ignore any trailing columns
            else:
                print(f"Skipping malformed row in file for run_id {run_id}: {row}")

    # Create DataFrame from the cleaned data; unparseable cells become NaN
    nt_freq_df = pd.DataFrame(data, columns=columns)
    for col in ["Pos", "Read_Depth", "FreqA", "FreqT", "FreqG", "FreqC"]:
        nt_freq_df[col] = pd.to_numeric(nt_freq_df[col], errors="coerce")

    # Turn frequencies into integer counts; NaN/inf are treated as 0
    read_depth = np.nan_to_num(
        nt_freq_df["Read_Depth"].values, nan=0, posinf=0, neginf=0
    )
    counts = {
        col: np.round(
            np.nan_to_num(nt_freq_df[col].values, nan=0, posinf=0, neginf=0)
            * read_depth
        ).astype(int)
        for col in ("FreqA", "FreqT", "FreqG", "FreqC")
    }

    # Pass the counts in the order of the callee's signature (a, t, c, g)
    nt_freq_df["nucleotide_diversity"] = calculate_d_for_pie_diversity(
        counts["FreqA"], counts["FreqT"], counts["FreqC"], counts["FreqG"]
    )

    # Lookup table position -> diversity, built only from rows with a usable Pos
    has_position = nt_freq_df["Pos"].notna()
    diversity_by_pos = dict(
        zip(
            nt_freq_df.loc[has_position, "Pos"].tolist(),
            nt_freq_df.loc[has_position, "nucleotide_diversity"].tolist(),
        )
    )

    results = []
    for idx, base_positions in base_positions_dict[run_id].items():
        # Keep one value per listed position so the stored list stays aligned
        # with the probe's position list; unknown positions become NaN.
        nucleotide_diversity_values = [
            np.nan if pos is None else diversity_by_pos.get(pos, np.nan)
            for pos in base_positions
        ]

        # np.nanmean warns on empty / all-NaN input, so handle that explicitly
        if np.all(np.isnan(nucleotide_diversity_values)):
            avg_diversity = np.nan
        else:
            avg_diversity = np.nanmean(nucleotide_diversity_values)

        results.append(
            {
                "run_id": run_id,
                "index": idx,
                "nucleotide_diversity": str(nucleotide_diversity_values),
                "avg_nucleotide_diversity": avg_diversity,
            }
        )

    return pd.DataFrame(results)


# Group the parsed position lists by run: {run_id: {row index: [positions]}}
# NOTE(review): eval() executes whatever Python expression the TSV cell holds.
# If this column only ever contains list literals, ast.literal_eval would be a
# safer parser -- confirm before switching.
base_positions_dict = {}
for idx, row in all_probes_results_df.iterrows():
    run_id = row["run_id"]
    base_positions_dict.setdefault(run_id, {})[idx] = eval(
        row["base_reference_read_position_list"]
    )


# Compute the diversity results run by run and stack them into one frame,
# keyed by the row index of the original DataFrame
diversity_dir = "../diversity_calculation"
unique_run_ids = all_probes_results_df["run_id"].unique()
per_run_results = [
    process_run_id(run_id, diversity_dir, base_positions_dict)
    for run_id in unique_run_ids
]
results = pd.concat(per_run_results).set_index("index")

# Discard any earlier versions of the result columns so the join below does
# not collide with them
columns_to_update = ["nucleotide_diversity", "avg_nucleotide_diversity"]
stale_columns = [
    col for col in columns_to_update if col in all_probes_results_df.columns
]
all_probes_results_df = all_probes_results_df.drop(columns=stale_columns)

# Attach the fresh results, matching on the row index
all_probes_results_df = all_probes_results_df.join(results[columns_to_update])

print("Efficient nucleotide diversity calculation completed for all rows.")
preview_columns = ["run_id", "nucleotide_diversity", "avg_nucleotide_diversity"]
print(all_probes_results_df[preview_columns].head())

Number of positions where diversity couldn't be calculated: 1
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 1
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 9
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 1
Number of positions where diversity couldn't be calculated: 3
Number of positions where diversity couldn't be calculated: 85
Number of positions where diversity couldn't be calculated: 2
Number of positions where diversity couldn't be calculated: 54
Number of positions where diversity couldn't be calculated: 8
Efficient nucleotide diversity calculation completed for all rows.
 

## Save dataframe to share with colleagues

In [10]:
# Write the augmented table as a tab-separated file, leaving out the row index
output_tsv_path = "../data_exports/updated_forced_alignments_analysis_with_nucleotide_diversity_all_probes_results_df.tsv"
all_probes_results_df.to_csv(output_tsv_path, sep="\t", index=False)