In [1]:
# Allow src folder to be imported from this notebook
import sys
from pathlib import Path

# Parent of the current working directory; adding it to the import path is
# what makes the `src` package importable from this notebook.
module_path = str(Path.cwd().parents[0])
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

___

In [2]:
import json
import gurobipy as gp
from gurobipy import GRB
from src.blocks import Block
from src.msa import AnalyzerMSA

In [3]:
# Name of the MSA under study and the FASTA file it lives in
NAME_MSA = "Cluster_6892"
path_msa = f"../msas/{NAME_MSA}.fa"

# Load the alignment; load_msa returns the alignment together with its
# number of sequences (rows) and number of columns.
amsa = AnalyzerMSA()
msa_info = amsa.load_msa(path_msa)
align, n_seqs, n_cols = msa_info
(n_seqs, n_cols)

(3, 158)

**Blocks for each position (r,c) in the MSA** 

$(\{r\},c,c,MSA[r,c])$

In [5]:
# One single-cell block per MSA position, generated column by column and,
# inside each column, row by row.
blocks_one_char = [
    Block(K=(row,), i=col, j=col, label=align[row,col])
    for col in range(n_cols)
    for row in range(n_seqs)
]

blocks_one_char[:3]

[Block(K=(0,), i=0, j=0, label='C'),
 Block(K=(1,), i=0, j=0, label='-'),
 Block(K=(2,), i=0, j=0, label='-')]

**Set of blocks from the decomposition**

In [6]:
# Read the decomposed blocks stored as JSON for this MSA
path_blocks = f"../experiment/block_decomposition/{NAME_MSA}.json"

with open(path_blocks) as fp:
    raw_blocks = json.load(fp)

# each JSON entry holds the positional arguments of a Block
decomposed_blocks = [Block(*fields) for fields in raw_blocks]

decomposed_blocks[:3]

[Block(K=(0, 1, 2), i=96, j=153, label='TAGTGAGTACGGAGAAAATCCTCGTGGGAAAGTATAAAAGATTCTTTTTGAGGTTGTC'),
 Block(K=(0, 1), i=83, j=153, label='AGGAAATTGTGAGTAGTGAGTACGGAGAAAATCCTCGTGGGAAAGTATAAAAGATTCTTTTTGAGGTTGTC'),
 Block(K=(0, 1), i=66, j=76, label='GAGTTTTCACG')]

In [7]:
def _block_key(b):
    "Index of a block as used for the Gurobi variables: (rows joined by ',', i, j)"
    return (",".join(map(str, b.K)), b.i, b.j)

# set B: input blocks (maximal blocks, the decompositions under intersection by pairs and blocks of one position in the MSA)
set_B = decomposed_blocks + blocks_one_char

# indexes of the blocks: (K,i,j)
blocks = [_block_key(b) for b in set_B]

# indexes of the MSA positions: (row, col), listed row by row
msa_positions = []
for r in range(n_seqs):
    msa_positions.extend((r, c) for c in range(n_cols))

# string (label) of each block, looked up by the same index used for the variables
strings_ = {_block_key(b): b.label for b in set_B}

In [9]:
def check_intersection(block1,block2):
    """Tell whether two blocks share at least one position of the MSA.

    Parameters
    ----------
    block1, block2 : tuple
        Blocks given as ``(K, i, j)``: ``K`` is a string with the rows joined
        by "," (e.g. ``"0,2"``) and ``i``, ``j`` are the first and last column
        of the block (both inclusive).

    Returns
    -------
    bool
        True when the blocks have a row in common AND their column intervals
        overlap, False otherwise.
    """
    # rows are compared as strings, exactly as they appear in K
    share_rows = not set(block1[0].split(",")).isdisjoint(block2[0].split(","))

    # two inclusive intervals overlap iff the latest start is not past the
    # earliest end (this is also False when one interval is empty, i > j)
    share_cols = max(block1[1], block2[1]) <= min(block1[2], block2[2])

    return share_rows and share_cols

# example
block1 = ("1,2",1,1)
block2 = ("3",1,3)
check_intersection(block1, block2)

False

In [10]:
# Create the model
model = gp.Model("pangeblocks")

# define variables
#   C[K,i,j]: binary, one per block in `blocks`
#   U[r,c]  : binary, one per position of the MSA
C = model.addVars(blocks, vtype=GRB.BINARY, name="C")
U = model.addVars(msa_positions, vtype=GRB.BINARY, name="U")

# Constraints: 
for r,c in msa_positions:

    # subset of blocks that covers the position [r,c]
    subset_C = [ C[K,i,j] for K,i,j in blocks if str(r) in K.split(",") and i<=c<=j ]

    # NOTE: a position that no block covers gets no constraint at all
    if subset_C:
        # print(f"{len(subset_C)} blocks cover the position {(r,c)}")
        
        ## 1. U[r,c] can only be 1 if AT LEAST one block covering (r,c) is selected
        model.addConstr( U[r,c] <= gp.quicksum(subset_C), name=f"constraint1({r},{c})")
        
        ## 2. each position of the MSA must be covered (U[r,c] is forced to 1)
        ##    "covered only once" is not imposed here: it follows from constraint 3
        model.addConstr( U[r,c] >= 1, name=f"constraint2({r},{c})")


## 3. overlapping blocks cannot be chosen
# Sort the blocks by starting column: once a candidate block starts after the
# current block ends, no later candidate can overlap the current block either.
blocks = sorted(blocks, key=lambda b: b[1]) # sort blocks by the starting position (K,start,end)

# note: set K is a string with the rows concatenated by a "," (due to Gurobi
# requirements for index the variables); split every K only once
rows_per_block = [set(b[0].split(",")) for b in blocks]

# analyze the intersections and add one constraint per pair of overlapping blocks
names_constraint3=[]
for pos1,block1 in enumerate(blocks):
    K1,i1,j1=block1
    rows1 = rows_per_block[pos1]

    # compare against the next blocks in the sorted list
    for pos2 in range(pos1+1, len(blocks)):
        block2 = blocks[pos2]
        K2,i2,j2=block2

        # blocks are sorted by start: nothing from here on can reach block1
        if i2 > j1:
            break

        # the blocks intersect when they share a row and a column
        common_rows = not rows1.isdisjoint(rows_per_block[pos2]) # intersection set K
        common_cols = max(i1,i2) <= min(j1,j2)                   # intersection columns [i,j]

        if (common_rows and common_cols):

            # if the blocks intersect, then create the restriction 
            name_constraint=f"constraint3({K1},{i1},{j1})-({K2},{i2},{j2})"
            model.addConstr(C[block1] + C[block2] <= 1 , name=name_constraint)
            names_constraint3.append(name_constraint)

# Objective function: minimize the sum of all the C variables
total_C = C.sum('*','*','*')
model.setObjective(total_C, GRB.MINIMIZE)

# Solve the model
model.optimize()

# Save the model in LP format, creating the output folder if it is missing
Path("ilp-models").mkdir(exist_ok=True)
model.write("ilp-models/model1.lp")

Set parameter Username
Academic license - for non-commercial use only - expires 2023-10-29
Gurobi Optimizer version 9.5.2 build v9.5.2rc0 (linux64)
Thread count: 32 physical cores, 64 logical processors, using up to 32 threads
Optimize a model with 2650 rows, 988 columns and 6397 nonzeros
Model fingerprint: 0x0a2c1d3f
Variable types: 0 continuous, 988 integer (988 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+00]
Found heuristic solution: objective 474.0000000
Presolve removed 2641 rows and 979 columns
Presolve time: 0.05s
Presolved: 9 rows, 9 columns, 38 nonzeros
Found heuristic solution: objective 78.0000000
Variable types: 0 continuous, 9 integer (9 binary)

Root relaxation: cutoff, 1 iterations, 0.00 seconds (0.00 work units)

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/N

In [11]:
"constraint3(1,2,0,3)-(0,1,1,5)" in names_constraint3

False

In [23]:
# Read the "X" attribute (value in the current solution) of every C and U variable
solution_C = model.getAttr("X", C)
solution_U = model.getAttr("X",U)
# sanity check: both collections contain at least one value
len(solution_C)>0, len(solution_U)>0

(True, True)

In [18]:
# Rebuild the Block objects selected by the solution
used_blocks = []
for k,v in solution_C.items(): 
    K,i,j=k
    # solution values are floats: compare against 0.5 instead of 0 so that a
    # binary variable at 0 up to the solver tolerance is not taken as selected
    if v > 0.5:
        # K is the string with the rows joined by ",": parse it without eval
        rows = tuple(int(row) for row in K.split(","))
        used_blocks.append(
            Block(rows,i,j, strings_[K,i,j])
        )

In [22]:
# Selected blocks ordered by their starting column, and how many there are
sorted_solution=sorted(used_blocks, key=lambda b: b.i)
len(sorted_solution)

78

___

In [None]:
# Inspect constraint 2 of position (1,28): print its left-hand side, sense and right-hand side
constraint = model.getConstrByName("constraint2(1,28)")
print(f"{model.getRow(constraint)} {constraint.Sense} {constraint.RHS}")

In [None]:
# Solution values of the blocks ("0,1",27,31) and ("1,2",28,45)
solution_C["0,1",27,31], solution_C["1,2",28,45]

In [None]:
# Sum over the selected blocks of (label length x number of rows)
# NOTE(review): presumably meant to be compared with n_seqs*n_cols -- confirm
sum(len(b.label)*len(b.K) for b in used_blocks)

In [None]:
# Number of MSA positions whose U variable is exactly 0 in the solution
solution_U = model.getAttr("X", U)
len([k for k,v in solution_U.items() if v == 0])

In [None]:
# Number of MSA positions whose U variable is greater than 1 in the solution
len([k for k,v in solution_U.items() if v > 1])

In [None]:
# Solution value of U for the position (row 1, column 28)
solution_U[1,28]

In [None]:
# number of variables: one C per block plus one U per MSA position
len(blocks) + len(msa_positions)

In [None]:
# Look up constraint 1 of position (2,155) by its name and display it
cc = model.getConstrByName("constraint1(2,155)")
cc

In [None]:
# C variables of the blocks that cover the position (r,c).
# NOTE(review): r and c are not defined in this cell; they come from whatever
# cell ran before (the constraints loop or `r,c = msa_positions[98]`) -- confirm
[ C[K,i,j] for K,i,j in blocks if str(r) in K.split(",") and i<=c<=j ]

In [None]:
# Unpack the index (K,i,j) of the first block in `blocks`
K,i,j=blocks[0]


In [None]:
# Pick the MSA position stored at index 98 of `msa_positions`
r,c = msa_positions[98]

In [None]:
# Does the block (K,i,j) cover the position (r,c)? Same test used to build subset_C
str(r) in K.split(",") and i <=c<=j

___