In [None]:
import json
import networkx as nx
from networkx.classes.function import edges


def read_json_data(path):
  """Parse the JSON document stored at ``path`` and return the decoded object.

  The file is opened in text mode with the platform default encoding and is
  closed before the function returns.
  """
  with open(path) as handle:
    return json.load(handle)


def create_positive_graph(json_data):
  """Build the forward-strand directed graph from a decoded graph JSON.

  Every entry of ``json_data["node"]`` becomes a node keyed by ``int(id)``
  carrying its ``sequence`` under the ``seq`` attribute; every entry of
  ``json_data["edge"]`` becomes a directed edge ``int(from) -> int(to)``.

  Note: ``json_data`` is emptied in place before returning, so the caller's
  dictionary is no longer usable afterwards.
  """
  graph = nx.DiGraph()
  graph.add_nodes_from(
      (int(record["id"]), {"seq": record["sequence"]})
      for record in json_data["node"]
  )
  graph.add_edges_from(
      (int(link["from"]), int(link["to"]))
      for link in json_data["edge"]
  )
  # Drop the raw records now that they have been copied into the graph.
  json_data.clear()
  return graph


# Complement of every IUPAC nucleotide code, upper and lower case.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYKMBDHVSWNacgtrykmbdhvswn",
    "TGCAYRMKVHDBSWNtgcayrmkvhdbswn",
)


def reverse_complement(seq):
  """Return the reverse complement of a nucleotide sequence.

  Bases are complemented case-preservingly using the IUPAC table
  (A<->T, C<->G, R<->Y, K<->M, B<->V, D<->H; S, W and N map to themselves).
  Any other character is passed through unchanged, so the result always has
  the same length as ``seq``. Previously every character outside ``ACGTN``
  was silently dropped, which shortened the sequence and shifted coordinates.
  """
  return seq.translate(_COMPLEMENT_TABLE)[::-1]


def create_negative_graph(G_positive):
  """Derive the reverse-strand graph from the forward-strand graph.

  The result is the reversed copy of ``G_positive`` (as returned by its
  ``reverse()`` method) in which each node's ``seq`` attribute has been
  replaced by its reverse complement.
  """
  G_negative = G_positive.reverse()
  for _, attrs in G_negative.nodes(data=True):
    attrs["seq"] = reverse_complement(attrs["seq"])
  return G_negative


def find_pam_nodes(G, PAM):
  """Return the nodes at which the last two PAM bases can be found.

  Two kinds of node are reported, each exactly once:

  * nodes whose own ``seq`` contains ``PAM[-2:]``;
  * nodes whose ``seq`` starts with ``PAM[-1]`` and that have at least one
    predecessor whose ``seq`` ends with ``PAM[-2]`` (the two bases are split
    across an edge).

  The result lists the first kind in graph iteration order, followed by the
  second kind in graph iteration order. ``PAM`` must have at least two
  characters.
  """
  pam_core = PAM[-2:]
  first_base = PAM[-2]
  last_base = PAM[-1]

  whole_hits = []
  split_hits = []
  for node in G.nodes():
    seq = G.nodes[node]["seq"]
    if pam_core in seq:
      whole_hits.append(node)
    elif seq.startswith(last_base) and any(
        G.nodes[pred]["seq"].endswith(first_base)
        for pred in G.predecessors(node)
    ):
      # any() stops at the first matching predecessor, so the node is
      # recorded once no matter how many predecessors qualify.
      split_hits.append(node)

  return whole_hits + split_hits


# Collect the nodes upstream of a PAM node that may hold the protospacer.
def PAM_DFS(G, PAM_node, PAM, max_seq_depth):
  """Walk predecessors of ``PAM_node`` until a sequence-length budget runs out.

  The budget starts at ``max_seq_depth`` minus the offset of ``PAM[-2:]``
  inside the PAM node's sequence (``str.find``, so -1 when it is absent) and
  is shared by the whole walk. Despite the function name, nodes are expanded
  in the order they were discovered (breadth-first).

  The original note on the budget reads:
  (guide length + bulges) * max bulges considered = (20 + 0) * 4.

  Returns the visited nodes in discovery order, ``PAM_node`` first.
  """
  budget = max_seq_depth - G.nodes[PAM_node]["seq"].find(PAM[-2:])
  visited = [PAM_node]
  cursor = 0
  # `visited` grows while it is being scanned; the cursor follows it.
  while cursor < len(visited):
    for pred in G.predecessors(visited[cursor]):
      pred_len = len(G.nodes[pred]["seq"])
      # Stop the whole walk once this predecessor would exhaust the budget
      # and the budget is no larger than 20 (presumably the guide length).
      if budget - pred_len <= 0 and budget <= 20:
        return visited
      if pred not in visited:
        budget -= pred_len
        visited.append(pred)
    cursor += 1

  return visited


# Return the edges needed by find_all_paths.
def extract_edges(G, nodes):
  """List every directed edge of ``G`` whose two endpoints are both in ``nodes``.

  Ordered pairs of distinct nodes are tested in the order given by ``nodes``
  (outer loop = source, inner loop = destination). Fewer than two nodes
  yields an empty list.
  """
  return [
      (src, dst)
      for src in nodes
      for dst in nodes
      if src != dst and G.has_edge(src, dst)
  ]


# Enumerate every simple path that ends at the target node.
def find_all_paths(edges, target_node):
    """Return all simple paths from the source nodes of ``edges`` to ``target_node``.

    A temporary directed graph is built from ``edges``. Every node without an
    incoming edge is used as a starting point, and the simple paths from each
    of them to ``target_node`` are collected, grouped by starting node.
    """
    subgraph = nx.DiGraph()
    subgraph.add_edges_from(edges)

    # Paths are searched only from nodes with no incoming edge, i.e. the
    # outermost nodes of the extracted region.
    sources = [n for n in subgraph.nodes() if subgraph.in_degree(n) == 0]

    paths = []
    for source in sources:
        paths.extend(
            nx.all_simple_paths(subgraph, source=source, target=target_node)
        )

    return paths


def paths_on_strand(G, pam_nodes, PAM, max_seq_depth):
  """Compute, for each PAM node, the candidate paths that end at it.

  Returns one entry per element of ``pam_nodes``, in the same order. Each
  entry is a list of paths (lists of nodes). When the upstream walk finds
  only the PAM node itself, the entry is the single one-node path.
  """
  collected = []
  for target in pam_nodes:
    region = PAM_DFS(G, target, PAM, max_seq_depth)
    if len(region) == 1:
      # Nothing upstream: the only path is the PAM node itself.
      collected.append([region])
    else:
      collected.append(find_all_paths(extract_edges(G, region), target))

  return collected


def search_PAM_positions(seq, PAM):
  """Return candidate window starts for every usable ``PAM[-2:]`` hit in ``seq``.

  All (overlapping) occurrences of the last two PAM bases are scanned from
  left to right. An occurrence at index ``i`` is kept only when ``i > 20``,
  and contributes ``i - 21`` to the result, i.e. it leaves room for 20 bases
  plus one more base in front of the hit.
  """
  pam_core = PAM[-2:]
  starts = []
  hit = seq.find(pam_core)
  while hit != -1:
    if hit >= 21:
      starts.append(hit - 21)
    hit = seq.find(pam_core, hit + 1)

  return starts


def count_mismatch(guide, seq, positions, strand, path, seq_nodes, max_miss=4):
  """Score the guide against ``seq`` at each candidate position.

  For every start offset in ``positions`` the ``len(guide)`` bases of ``seq``
  beginning there are compared with ``guide`` base by base. A site is
  reported when it has at most ``max_miss`` mismatches.

  Args:
    guide: guide sequence to align.
    seq: concatenated sequence of the path.
    positions: 0-based start offsets in ``seq`` to test.
    strand: strand label copied into each result.
    path: node path copied into each result.
    seq_nodes: per-node sequences copied into each result.
    max_miss: maximum number of mismatches tolerated (default 4).

  Returns:
    A list of dicts with keys ``path``, ``seq_nodes``, ``seq``,
    ``seq_match`` (the aligned bases with mismatches in lower case, followed
    by the three bases after the window), ``mismatches``, ``start`` (the
    offset the window starts at) and ``strand``.

  Positions whose window plus the three trailing bases does not fit inside
  ``seq`` are skipped instead of raising ``IndexError``. The single-node
  reduction offered by ``filter`` is not applied here.
  """
  pam_len = 3  # bases appended to seq_match after the aligned window
  guide_len = len(guide)
  results = []
  for start in positions:
    pam_start = start + guide_len
    if start < 0 or pam_start + pam_len > len(seq):
      continue

    mismatches = 0
    aligned = []
    for seq_base, guide_base in zip(seq[start:pam_start], guide):
      if seq_base != guide_base:
        mismatches += 1
        aligned.append(seq_base.lower())
      else:
        aligned.append(seq_base)

    if mismatches > max_miss:
      continue

    results.append({
      'path' : path,
      'seq_nodes' : seq_nodes,
      'seq': seq,
      'seq_match': "".join(aligned) + seq[pam_start:pam_start + pam_len],
      'mismatches': mismatches,
      # Report the real window start; it used to be derived as "end - 20",
      # which is only right for 20-nt guides.
      'start': start,
      'strand': strand
    })
  return results


def filter(result):
  """Narrow a hit to a single node when one node holds the whole match.

  The upper-cased ``seq_match`` is looked up in each entry of
  ``result['seq_nodes']`` in order. For the first node sequence containing
  it, a new dict is returned whose ``path`` and ``seq_nodes`` hold only that
  node; every other field is copied as is. When no node contains the match,
  ``result`` itself is returned.

  Note: this name shadows the built-in ``filter`` within the module.
  """
  node_seqs = result['seq_nodes']
  hit = next(
      (idx for idx, node_seq in enumerate(node_seqs)
       if result['seq_match'].upper() in node_seq),
      None,
  )
  if hit is None:
    return result

  return {
      'path' : [result['path'][hit]],
      'seq_nodes' : [node_seqs[hit]],
      'seq': result['seq'],
      'seq_match': result['seq_match'],
      'mismatches': result['mismatches'],
      'start': result['start'],
      'strand': result['strand']
  }


def compare_with_guide(G, all_paths, sgRNA_guide, PAM, strand):
  """Align the guide against every candidate path and return the hits.

  Args:
    G: graph whose nodes carry a ``seq`` attribute.
    all_paths: list (one entry per PAM node) of lists of paths, each path
      being a list of nodes.
    sgRNA_guide: guide sequence to search for.
    PAM: PAM pattern passed on to ``search_PAM_positions``.
    strand: strand label stored in each hit.

  Returns:
    A list with one entry per distinct path that produced at least one hit;
    each entry is the non-empty list of hit dicts returned by
    ``count_mismatch`` for that path.

  This function used to return an always-empty list because the only
  assignment to the returned variable was commented out, so every hit was
  thrown away. Paths are de-duplicated here: a path seen before is skipped,
  since it would produce exactly the same hits again.
  """
  final_results = []
  seen_paths = set()
  for paths in all_paths:  # one list of paths per PAM node
    for path in paths:  # one list of nodes per path
      path_key = tuple(path)
      if path_key in seen_paths:
        continue
      seen_paths.add(path_key)

      seq_nodes = [G.nodes[node]["seq"] for node in path]
      seq = "".join(seq_nodes)
      positions = search_PAM_positions(seq, PAM)
      hits = count_mismatch(sgRNA_guide, seq, positions, strand, path, seq_nodes)
      if hits:
        final_results.append(hits)

  return final_results


def unique_extraction(final_results):
  """Keep only the first group of hits for each distinct path.

  ``final_results`` is a list of groups, each group being a list of hit
  dicts. A group is identified by the ``path`` of its first hit. Groups are
  returned in their original order, and empty groups are skipped.

  Seen paths are tracked in a set of tuples so the whole pass is linear in
  the number of groups. Path elements must therefore be hashable, which
  graph node identifiers are.
  """
  seen_paths = set()
  unique_list = []
  for hits in final_results:
    if not hits:
      continue
    path_key = tuple(hits[0]['path'])
    if path_key not in seen_paths:
      seen_paths.add(path_key)
      unique_list.append(hits)
  return unique_list


In [None]:
import json
import networkx as nx

# Run-wide settings, named once so both strands are searched identically.
DATA_PATH = '/content/drive/MyDrive/chr_22.json'
SGRNA_GUIDE = "GAGTCCGAGCAGAAGAAGAA"
MAX_SEQ_DEPTH = 30
PAM = "NGG"

# Build the forward graph, add two extra edges through node 3760415, then
# derive the reverse-strand graph from the patched forward graph.
json_data = read_json_data(DATA_PATH)
G_positive = create_positive_graph(json_data)
G_positive.add_edges_from([(3760415, 1094873), (1094870, 3760415)])
G_negative = create_negative_graph(G_positive)

# Forward strand: locate PAM nodes, enumerate paths, align the guide.
positive_pam_nodes = find_pam_nodes(G_positive, PAM)
all_positive_paths = paths_on_strand(
    G_positive, positive_pam_nodes, PAM, MAX_SEQ_DEPTH)
positive_strand_results = compare_with_guide(
    G_positive, all_positive_paths, SGRNA_GUIDE, PAM, '+')

for result in positive_strand_results:
  print(result)

# Reverse strand: same pipeline on the reverse-complemented graph.
negative_pam_nodes = find_pam_nodes(G_negative, PAM)
all_negative_paths = paths_on_strand(
    G_negative, negative_pam_nodes, PAM, MAX_SEQ_DEPTH)
negative_strand_results = compare_with_guide(
    G_negative, all_negative_paths, SGRNA_GUIDE, PAM, '-')

for result in negative_strand_results:
  print(result)

[{'path': [3200439, 3200440], 'seq_nodes': ['G', 'AGTCCGTGTAGAAGCAGAGGGGCTGTACAGCT'], 'seq': 'GAGTCCGTGTAGAAGCAGAGGGGCTGTACAGCT', 'seq_match': 'GAGTCCGtGtAGAAGcAGAgGGG', 'mismatches': 4, 'start': 0, 'strand': '+'}]
[{'path': [1094870, 1094872, 1094873], 'seq_nodes': ['GAGTCCCAGCAGAAGGA', 'C', 'GAGGGGCTGTGA'], 'seq': 'GAGTCCCAGCAGAAGGACGAGGGGCTGTGA', 'seq_match': 'GAGTCCcAGCAGAAGgAcgAGGG', 'mismatches': 4, 'start': 0, 'strand': '+'}]
[{'path': [1094870, 1094871, 1094873], 'seq_nodes': ['GAGTCCCAGCAGAAGGA', 'G', 'GAGGGGCTGTGA'], 'seq': 'GAGTCCCAGCAGAAGGAGGAGGGGCTGTGA', 'seq_match': 'GAGTCCcAGCAGAAGgAGgAGGG', 'mismatches': 3, 'start': 0, 'strand': '+'}]
[{'path': [1094870, 3760415, 1094873], 'seq_nodes': ['GAGTCCCAGCAGAAGGA', 'G', 'GAGGGGCTGTGA'], 'seq': 'GAGTCCCAGCAGAAGGAGGAGGGGCTGTGA', 'seq_match': 'GAGTCCcAGCAGAAGgAGgAGGG', 'mismatches': 3, 'start': 0, 'strand': '+'}]
[{'path': [1041825, 1041826], 'seq_nodes': ['CAGGGCTGGGTGACACCTGGGAGAAGAGTCCA', 'GGCAGAAGGAGCAGGGAAGGTGAGGTCCTTGG'], 's