Plotting height traces and finding ways of automatically detecting DNA damage.

The aim of this notebook is to create a plotting pipeline for DNA height traces and to find robust ways of highlighting DNA damage.

In [None]:
## Importing packages.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import statistics as stats
import json
from scipy.signal import savgol_filter
from scipy.signal import find_peaks
import random

## Any new packages that are needed should be imported here!

In [None]:
## Load the height-trace statistics (a .json file) and drill down to one molecule.
## NOTE(review): both the file path and the image key are absolute,
## machine-specific paths - this cell only runs where that folder exists.

with open("/Users/t/Documents/pIC0z/20240112_pIC0z_rel/pIC0z_rel_single_molecules/output_height_trace_branch/height_stats.json") as f:
    df = json.load(f)

## Show the whole decoded object, then its Python type.
print(df)
print(type(df))

## Entry for a single image (keyed by the image path); printed after a blank line.
image_dict = df["/Users/t/Documents/pIC0z/20240112_pIC0z_rel/pIC0z_rel_single_molecules/20240112_200nm_6ng_pIC0z_notel_topo70_Mg_Ni.0_00020"]
print("", image_dict, sep="\n")

## Entry for molecule '0' of that image; printed after a blank line.
mol_dict = image_dict["0"]
print("", mol_dict, sep="\n")

## The molecule entry unpacks into exactly two items: heights first, distances second.
heights, distances = mol_dict
print(heights, distances, sep="\n")

In [None]:
## Quick look at the 'raw' height trace plot (heights against distances, unscaled).

plt.plot(distances, heights)

## Everything is in metres - this will need to be changed to nanometres.
## The size/scaling of the graph is not great; it is very compact.

In [None]:
## Rescale both axes by 1E9 (metres -> nanometres), which is easier to interpret,
## and store the results as numpy arrays so later cells can use array indexing
## and arithmetic on them.

heights_in_nm = np.array([height * 1E9 for height in heights])
distances_in_nm = np.array([distance * 1E9 for distance in distances])

## A wide, short figure stretches the trace out so it is no longer compressed.
plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, heights_in_nm)

In [None]:
## Show the change in height relative to the median (the median is used because
## the data is not normally distributed). Subtracting the median from every
## point makes the median of the adjusted trace effectively zero.

## The median is computed once and reused; previously it was recomputed for
## every point of the trace inside the subtraction loop.
median_height_nm = stats.median(heights_in_nm)
print(median_height_nm)

## Nanometre heights with the median drawn as a dashed reference line.
plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, heights_in_nm, color = "black")
plt.axhline(median_height_nm, color = "red", linestyle = "dashed", label = f"median height = {median_height_nm:.2f} nm")
plt.legend()

## New array of heights with the median subtracted (vectorised numpy subtraction).
med_adj_heights = heights_in_nm - median_height_nm

## Median-adjusted heights. The dashed line sits at the median of the adjusted
## trace, while the legend reports the median of the original heights.
plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, med_adj_heights, color = "black")
plt.axhline(stats.median(med_adj_heights), color = "red", linestyle = "dashed", label = f" Median height of DNA = {median_height_nm:.2f} nm")
plt.legend(loc = "upper center")

In [None]:
## Same median-adjusted trace as before, now with a title and axis labels.

## Values needed by the figure are worked out first.
baseline = stats.median(med_adj_heights)
baseline_label = f" Median height of DNA = {stats.median(heights_in_nm):.2f} nm"
x_lower = min(distances_in_nm) - 10
x_upper = max(distances_in_nm) + 10

plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, med_adj_heights, color = "grey")
plt.axhline(baseline, color = "red", linestyle = "dashed", label = baseline_label)
plt.title("DNA height trace using median adjusted values")
plt.xlabel("Distance from the trace origin (nm)")
plt.ylabel("Change in DNA height from the median")
plt.legend(loc = "upper center")
## Leave a 10 nm margin on either side of the trace.
plt.xlim(x_lower, x_upper)

In [None]:
## Locate the peaks and valleys of the height trace so measurements can be
## made from them. find_peaks returns (indices, properties); only the indices
## are kept. Valleys are found as the peaks of the negated trace.

peaks = find_peaks(med_adj_heights)[0]
valleys = find_peaks(-med_adj_heights)[0]

## Marker styling kept together so the two marker layers read the same way.
peak_marker = dict(color = "#D81B60", markersize = 4, label = "Peak in DNA height")
valley_marker = dict(color = "#1E88E5", markersize = 4, label = "Valley in DNA height")
median_line_label = f"Median height of DNA = {stats.median(heights_in_nm):.2f} nm"

plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, med_adj_heights, color = "grey")
plt.plot(distances_in_nm[peaks], med_adj_heights[peaks], "+", **peak_marker)
plt.plot(distances_in_nm[valleys], med_adj_heights[valleys], "x", **valley_marker)
plt.axhline(stats.median(med_adj_heights), color = "black", alpha = 0.5, linestyle = "dashed", label = median_line_label)
plt.xlim(min(distances_in_nm)-10, max(distances_in_nm)+10)
plt.ylim(-0.4,0.4)
plt.legend(loc = "upper center")
plt.title("DNA height trace using median adjusted values")
plt.xlabel("Distance from the trace origin (nm)")
plt.ylabel("Change in DNA height from the median")


In [None]:
## Spacing between neighbouring peaks, and between neighbouring valleys.
##
## `peaks` and `valleys` hold array *indices*, so subtracting neighbouring
## entries only gives a number of samples. The spacing is therefore taken
## between the matching entries of `distances_in_nm`, which gives the
## nanometre values that the axis labels promise.

def plot_spacing_histogram(spacings_nm, x_label, title, edgecolor):
    """Draw a 25-bin histogram of spacings (nm): spacing on x, count on y.

    spacings_nm -- sequence of spacings between neighbouring features
    x_label     -- label for the spacing axis
    title       -- figure title
    edgecolor   -- outline colour of the (white-filled) bars
    """
    plt.figure(figsize = (8,4))
    plt.hist(spacings_nm, bins = 25, color = "white", edgecolor = edgecolor)
    plt.xlabel(x_label)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()


## Kept as plain lists, as before; empty when fewer than two peaks exist.
inter_peak_distance = np.diff(distances_in_nm[peaks]).tolist()

print(type(peaks))
print(len(inter_peak_distance))
print(inter_peak_distance)

plot_spacing_histogram(inter_peak_distance, "Interpeak distance (nm)", "Frequency of interpeak distance", "#D81B60")

inter_valley_distance = np.diff(distances_in_nm[valleys]).tolist()

print(type(valleys))
print(len(inter_valley_distance))
print(inter_valley_distance)

plot_spacing_histogram(inter_valley_distance, "Inter-valley distance (nm)", "Frequency of inter-valley distance", "#1E88E5")

In [None]:
## Applying a Savitzky-Golay (SavGol) filter smooths the data. You can choose
## a) the window length of the smoothing and b) the order of the polynomial
## fitted inside each window.
## It is not yet clear whether the filter is actually useful. The window length
## is 3 because the major-groove to major-groove distance is 3.4 nm, rounded down.
## NOTE(review): savgol_filter's window length is a number of samples, not a
## distance, so 3 only corresponds to ~3 nm if the samples are ~1 nm apart -
## TODO confirm the sample spacing of the trace.

## Smoothed median-adjusted heights: computed once and reused below.
savgol_med_adj_heights = savgol_filter(med_adj_heights, 3, 1)

## Peaks of the smoothed trace.
savgol_peaks, _  = find_peaks(savgol_med_adj_heights)

## Valleys of the smoothed trace, i.e. peaks of its negation.
savgol_valleys, _ = find_peaks(-savgol_med_adj_heights)

plt.figure(figsize=(16,4))
plt.plot(distances_in_nm, savgol_med_adj_heights, color = "grey")
plt.plot(distances_in_nm[savgol_peaks], savgol_med_adj_heights[savgol_peaks], "+", color = "#D81B60", markersize = 4, label = "Peak in DNA height")
plt.plot(distances_in_nm[savgol_valleys], savgol_med_adj_heights[savgol_valleys], "x", color = "#1E88E5", markersize = 4, label = "Valley in DNA height")
plt.axhline(stats.median(med_adj_heights), color = "black", alpha = 0.5, linestyle = "dashed", label = f"Median height of DNA = {stats.median(heights_in_nm):.2f} nm")
plt.legend(loc = "upper center")
plt.xlim(min(distances_in_nm)-10, max(distances_in_nm)+10)
plt.ylim(-0.4,0.4)

## Highlight the 80-120 nm section over the full plotted height range.
## Rectangle takes (x, y) of the lower-left corner, then width, then height.
## The previous call mixed these up and added the patch to an undefined `ax`.
highlight_box = patches.Rectangle((80, -0.4), 40, 0.8, linewidth=1, edgecolor='pink', facecolor='lightpink', alpha=0.5)
plt.gca().add_patch(highlight_box)

plt.title("DNA height trace using median adjusted values AND SavGol filter")
plt.xlabel("Distance from the trace origin (nm)")
plt.ylabel("Change in DNA height from the median")

In [None]:
# Two stacked panels comparing the median-adjusted trace (top) with its
# SavGol-smoothed version (bottom), both zoomed to 80-120 nm.
fig, axs = plt.subplots(2, figsize=(8, 8))

# Shared by both panels, so computed once.
median_line_height = stats.median(med_adj_heights)
median_line_label = f"Median height of DNA = {stats.median(heights_in_nm):.2f} nm"

# (axes, trace, peak indices, valley indices, title) for each panel.
# The smoothed trace from the earlier cell is reused rather than re-filtered.
panels = [
    (axs[0], med_adj_heights, peaks, valleys,
     "DNA height trace using median adjusted values"),
    (axs[1], savgol_med_adj_heights, savgol_peaks, savgol_valleys,
     "DNA height trace using median adjusted values AND SavGol filter"),
]

for panel_ax, trace, peak_idx, valley_idx, panel_title in panels:
    panel_ax.plot(distances_in_nm, trace, color="grey")
    panel_ax.plot(distances_in_nm[peak_idx], trace[peak_idx], "+", color="#D81B60", markersize=4, label="Peak in DNA height")
    panel_ax.plot(distances_in_nm[valley_idx], trace[valley_idx], "x", color="#1E88E5", markersize=4, label="Valley in DNA height")
    panel_ax.axhline(median_line_height, color="black", alpha=0.5, linestyle="dashed", label=median_line_label)

    # Dotted guides at 90 and 110 nm; the view is zoomed to 80-120 nm.
    panel_ax.axvline(90, color="#FFC107", linestyle="dotted")
    panel_ax.axvline(110, color="#FFC107", linestyle="dotted")
    panel_ax.set_xlim(80, 120)
    panel_ax.set_ylim(-0.2, 0.3)

    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Distance from the trace origin (nm)")
    panel_ax.set_ylabel("Change in DNA height from the median")

    # Box around the region of interest; a patch can only belong to one axes,
    # so a fresh rectangle is created for every panel.
    highlight_box_2 = patches.Rectangle((97.5, -0.05), 5, 0.15, linewidth = 1, edgecolor = "red", facecolor = "pink", alpha=0.2)
    panel_ax.add_patch(highlight_box_2)
    panel_ax.legend(loc="upper right")

plt.tight_layout()  # Adjust subplot layout to prevent overlap
plt.show()



In [None]:
## Report the container type and the number of entries for the peak indices,
## then for the valley indices.
for extrema in (peaks, valleys):
    print(type(extrema))
    print(len(extrema))

In [None]:
## First peak index and first valley index.
print(peaks[0])
print(valleys[0])

## The entries of `peaks` / `valleys` are positions in the arrays: use them as
## indices to look up the matching value in the distance and height arrays.
print(distances_in_nm[(peaks[0])])
print(heights_in_nm[(peaks[0])])

print(len(peaks))
print(peaks)

print(len(valleys))
print(valleys)

## Will the peak and valley arrays always have equal length? Nothing in this
## notebook guarantees it, so check rather than assume.
lengths_match = "Yes" if len(peaks) == len(valleys) else "No"
print(f"Is the length of the peak array equal to the length of the valley array? {lengths_match}")


In [None]:
## Difference in height between a peak and its adjacent valley. Peak 42 is used
## as the example (the answer to life, the universe and everything...).
## NOTE(review): assumes valleys[k + 1] is the valley next to peaks[k], as the
## original cell paired them - TODO confirm against the plot.

peak_number = 42
adjacent_valley_number = peak_number + 1

print(peaks[peak_number]) # peak
print(valleys[adjacent_valley_number]) # adjacent valley
print(peaks[peak_number] - valleys[adjacent_valley_number])

## The values above are array indices, not heights, so look the heights up.
## The second lookup previously used peaks[43], which compared two peaks
## instead of the peak and its valley.

print(heights_in_nm[peaks[peak_number]])
print(heights_in_nm[valleys[adjacent_valley_number]])
print(heights_in_nm[peaks[peak_number]] - heights_in_nm[valleys[adjacent_valley_number]])

## Mark peak 42 (large marker) and the three peaks after it on the nm trace.
plt.figure(figsize = (16,4))
plt.plot(distances_in_nm, heights_in_nm)
plt.plot(distances_in_nm[peaks[42]],heights_in_nm[peaks[42]], "+",  markersize = 20)
plt.plot(distances_in_nm[peaks[43]],heights_in_nm[peaks[43]], "+",  markersize = 12)
plt.plot(distances_in_nm[peaks[44]],heights_in_nm[peaks[44]], "+",  markersize = 12)
plt.plot(distances_in_nm[peaks[45]],heights_in_nm[peaks[45]], "+",  markersize = 12)
plt.xlim(90,110)

In [None]:
# Synthetic "height trace": a clean sin/cos signal plus uniform noise.

# Seeded generator so the noisy curve is identical on every run
# (the noise was previously drawn from the unseeded global state).
rng = np.random.default_rng(0)

# Generate x values: 150 evenly spaced points from 0 to 20
x = np.linspace(0, 20, 150)

# Generate y values for fake height trace (sine and cosine addition)
y = np.sin(x) + np.cos(2*x)

# One uniform random offset in [-1/2, 1/2) per point, added to the clean signal
random_for_y = rng.uniform(-1/2, 1/2, len(y))
random_y = y + random_for_y

# Plot the data; both curves are shifted up by 2
plt.figure(figsize=(10, 6))
plt.plot(x, 2 + y,  color = 'black', label = 'sin(x) + cos(2x)')
plt.plot(x, 2 + random_y, color = 'red', alpha = 0.5, label = 'sin(x) + cos(2x) + random value (-1/2 to 1/2)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('sin(x) + cos(2x) data', style = 'oblique')
plt.legend(loc = 'upper center')
plt.show()

# Median of the clean (unshifted, noise-free) signal
print(stats.median(y))

In [None]:
## The median-adjusted trace again, without the labelling boxes, zoomed to the
## region of interest. The aim is to look at the height differences between
## these peaks and the valley.

peaks_of_interest = peaks[41:44]
valley_of_interest = valleys[43]
big_plus = dict(markersize = 14)

plt.plot(distances_in_nm, med_adj_heights, color = "grey")
plt.plot(distances_in_nm[peaks_of_interest], med_adj_heights[peaks_of_interest], "+", color="#D81B60", **big_plus)
plt.plot(distances_in_nm[valley_of_interest], med_adj_heights[valley_of_interest], "+", color="#1E88E5", **big_plus)
plt.ylim(-0.4, 0.4)
## Full-trace limits first, then the 90-110 nm zoom that is actually shown.
plt.xlim(min(distances_in_nm) - 10, max(distances_in_nm) + 10)
plt.xlim(90, 110)
plt.title("DNA height trace using median adjusted values")
plt.ylabel("Change in DNA height from the median")
plt.xlabel("Distance from trace origin (nm)")

In [None]:
## Heights of peaks 41-43 and of valley 43, and the differences between them.
print("The heights of the peaks of interest are:", (med_adj_heights[peaks[41:44]]))
print("The height of the valley of interest is:", (med_adj_heights[valleys[43]]))

## Each peak height minus the height of valley 43.
diff_peaks_valleys = med_adj_heights[peaks[41:44]] - med_adj_heights[valleys[43]]
print(diff_peaks_valleys)

print(med_adj_heights[peaks[41:44]] - np.absolute(diff_peaks_valleys))

print(distances_in_nm[peaks[43]])
print(distances_in_nm[valleys[43]])

plt.figure(figsize = (6,4))
plt.plot(distances_in_nm, med_adj_heights, color = "grey", label = "DNA height trace")
plt.plot(distances_in_nm[peaks[41:44]], med_adj_heights[peaks[41:44]], "+", color = "#D81B60", markersize = 12, label = "Peak in DNA height")
plt.plot(distances_in_nm[valleys[41:44]], med_adj_heights[valleys[41:44]], "x", color = "#1E88E5", markersize = 12, label = "Valley in DNA height")
plt.ylim(-0.2, 0.3)
plt.xlim(90, 110)

plt.title("DNA height trace using median adjusted values")
plt.ylabel("Change in DNA height from the median")
plt.xlabel("Distance from trace origin (nm)")

## For each peak/valley pair draw an "L": a vertical line at the peak from the
## valley height up to the peak height (the delta), and a horizontal line at
## the valley height joining the peak position to the valley position.
## The coordinates are read from the arrays instead of being typed in by hand,
## so the markers stay correct if the trace or the chosen indices change.
## NOTE(review): assumes the former hand-typed values (95.164 / 0.22, ...) were
## the position and height of peaks 41-43 - TODO confirm.
for pair_number, delta_linestyle in zip((41, 42, 43), ("-.", ":", "--")):
    peak_position = distances_in_nm[peaks[pair_number]]
    peak_height = med_adj_heights[peaks[pair_number]]
    valley_position = distances_in_nm[valleys[pair_number]]
    valley_height = med_adj_heights[valleys[pair_number]]

    plt.vlines(peak_position, valley_height, peak_height, color = "red", linestyle = delta_linestyle, alpha = 1.0, label = "∆ value")
    plt.hlines(valley_height, peak_position, valley_position, color = "red", linestyle = "-", alpha = 1.0)

highlight_box_2 = patches.Rectangle((97.5, -0.05), 5, 0.15, linewidth = 1, edgecolor = "red", facecolor = "pink", alpha=0.2)
plt.gca().add_patch(highlight_box_2)

plt.legend(loc = "upper right")


In [None]:
## Peak/valley delta on the SavGol-smoothed trace, for smoothed pair 24.
## Everything drawn here comes from the *smoothed* heights. Previously the
## trace was filtered a second time, and the delta lines used the unsmoothed
## heights plus coordinates typed in from the unsmoothed plot.

pair_peak = savgol_peaks[24]
pair_valley = savgol_valleys[24]

peak_position = distances_in_nm[pair_peak]
peak_height = savgol_med_adj_heights[pair_peak]
valley_position = distances_in_nm[pair_valley]
valley_height = savgol_med_adj_heights[pair_valley]

plt.figure(figsize = (6,4))
plt.plot(distances_in_nm, savgol_med_adj_heights, color = "grey", label = "DNA height trace")
plt.plot(peak_position, peak_height, "+", color = "#D81B60", markersize = 12, label = "Peak in DNA height")
plt.plot(valley_position, valley_height, "x", color = "#1E88E5", markersize = 12, label = "Valley in DNA height")
plt.ylim(-0.2, 0.3)
plt.xlim(90, 110)

plt.title("DNA height trace using median adjusted values AND SavGol filter \n ") ## you can split this title over several lines using " \n "
plt.ylabel("Change in DNA height from the median")
plt.xlabel("Distance from trace origin (nm)")

## Vertical line = height difference at the peak position; horizontal line
## joins the peak position to the valley position at the valley height.
plt.vlines(peak_position, valley_height, peak_height, color = "red", linestyle = "dashed", alpha = 1.0)
plt.hlines(valley_height, peak_position, valley_position, color = "red", linestyle = "dashed", alpha = 1.0, label = "∆ value")

highlight_box_2 = patches.Rectangle((97.5, -0.05), 5, 0.15, linewidth = 1, edgecolor = "red", facecolor = "pink", alpha=0.2)
plt.gca().add_patch(highlight_box_2)

plt.legend(loc = "upper right")

In [None]:
## It would be good to work out the difference between the peak and the valley
## in nanometres, to see how many points would be omitted. This is akin to
## working out the rate of change of height.

height_threshold = 0.2

peak_height = savgol_med_adj_heights[savgol_peaks[24]]
valley_height = savgol_med_adj_heights[savgol_valleys[24]]

## Compare the unrounded difference with the threshold; rounding first would
## let values just below the threshold (e.g. 0.196) count as reaching it.
## The rounded value is only used for display.
peak_valley_delta = peak_height - valley_height
peak_vs_valley = round(peak_valley_delta, 2)

if peak_valley_delta >= height_threshold:
    print("∆ peak vs. valley is", peak_vs_valley, "nm, therefore DOES EXCEED the height threshold of", height_threshold, "for a defect in the DNA")
else:
    print("∆ peak vs. valley is", peak_vs_valley, "and therefore DOES NOT EXCEED the height threshold of", height_threshold, "nm")

print(peak_height)
print(valley_height)

In [None]:
## Another thought: if the trace goes down it has to come back up, so we could
## look at the area of the 'triangle' formed by two adjacent peaks and the
## valley in between.
##
## Two figures of the 180-230 nm region, one after the other: the
## median-adjusted trace, then its SavGol-smoothed version, each with every
## peak and valley marked.

comparison = (
    (med_adj_heights, peaks, valleys,
     "DNA height trace using median adjusted values"),
    (savgol_med_adj_heights, savgol_peaks, savgol_valleys,
     "DNA height trace using median adjusted values AND SavGol filter"),
)

for trace, peak_idx, valley_idx, figure_title in comparison:
    plt.plot(distances_in_nm, trace, color = "grey")
    plt.plot(distances_in_nm[peak_idx], trace[peak_idx], "+", color="#D81B60", markersize = 14)
    plt.plot(distances_in_nm[valley_idx], trace[valley_idx], "+", color="#1E88E5", markersize = 14)
    plt.ylim(-0.4, 0.2)
    ## Full-trace limits first, then the 180-230 nm window that is shown.
    plt.xlim(min(distances_in_nm) - 10, max(distances_in_nm) + 10)
    plt.xlim(180, 230)
    plt.title(figure_title)
    plt.ylabel("Change in DNA height from the median")
    plt.xlabel("Distance from trace origin (nm)")
    plt.show()

## In these graphs the valley on the right would make a decent isosceles
## triangle once the SavGol filter has been applied, which would separate it
## from the left-hand valley.


In [None]:
# Toy triangle: three vertices given as parallel x / y coordinate lists.
x = [1, 3, 5]
y = [2, 5, 3]

# Repeat the first vertex at the end so the outline closes back on itself.
closed_x = [*x, x[0]]
closed_y = [*y, y[0]]
plt.plot(closed_x, closed_y, marker="x")

# Grid, title and axis labels.
plt.grid(True)
plt.title("Triangle Plot")
plt.xlabel("Distance")
plt.ylabel("Height")

# Display the plot
plt.show()


## whatever
