Skip to content

Commit

Permalink
feat: data-structure speed-bump
Browse files Browse the repository at this point in the history
  • Loading branch information
BenTenmann committed Dec 5, 2021
1 parent ec4320d commit 25127f3
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 16 deletions.
4 changes: 2 additions & 2 deletions include/setriq/alignment/SubstitutionMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class SubstitutionMatrix {
SubstitutionMatrix() : subMatrix(), edgeMap() {};
SubstitutionMatrix(const doubleMatrix&, const stringIndexMap&);

double forward (const std::string&, const std::string&);
double operator () (const std::string& a, const std::string &b) { return this->forward(a, b); };
double forward (const char&, const char&);
double operator () (const char& a, const char &b) { return this->forward(a, b); };
};


Expand Down
4 changes: 2 additions & 2 deletions include/setriq/utils/typeDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
#ifndef METRICS_TYPEDEFS_H
#define METRICS_TYPEDEFS_H

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

typedef std::vector<std::string> stringVector;
typedef std::vector<double> doubleVector;
typedef std::vector<doubleVector> doubleMatrix;
typedef std::map<std::string, size_t> stringIndexMap;
typedef std::unordered_map<char, size_t> stringIndexMap;

#endif //METRICS_TYPEDEFS_H
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def get_link_args():
install_requires=[
'glom>=20.0.0,<21.0.0',
'numpy>=1.0.0,<2.0.0',
'pandas>=1.0.0,<2.0.0',
'srsly>=2.0.0,<3.0.0',
],
package_dir={f'{PROJECT_NAME}': f'{SOURCE_DIR}/{PROJECT_NAME}'},
Expand Down
5 changes: 2 additions & 3 deletions src/setriq/_C/alignment/SmithWaterman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ double SmithWaterman::fillScoringMatrix(doubleMatrix &scoringMatrix,

for (size_t i = 1; i < N; i++) {
for (size_t j = 1; j < M; j++) {
alignmentScore = scoringMatrix[i - 1][j - 1] + this->substitutionMatrix(a.substr(i - 1, 1),
b.substr(j - 1, 1));
alignmentScore = scoringMatrix[i - 1][j - 1] + this->substitutionMatrix(a[i - 1], b[j - 1]);
kGapScore = this->calculateGapPenalty(scoringMatrix, i, j, kAxis);
lGapScore = this->calculateGapPenalty(scoringMatrix, j, i, lAxis);

Expand Down Expand Up @@ -74,7 +73,7 @@ double SmithWaterman::identityScore(const std::string &inputString) {

double score {0};
for (size_t i = 0; i < N; i++) {
score += this->substitutionMatrix(inputString.substr(i, 1), inputString.substr(i, 1));
score += this->substitutionMatrix(inputString[i], inputString[i]);
}
return score;
}
2 changes: 1 addition & 1 deletion src/setriq/_C/alignment/SubstitutionMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ SubstitutionMatrix::SubstitutionMatrix(const doubleMatrix& matrix, const stringI
this->edgeMap = index;
}

double SubstitutionMatrix::forward(const std::string &from, const std::string &to) {
double SubstitutionMatrix::forward(const char &from, const char &to) {
size_t fromIdx, toIdx;
fromIdx = this->edgeMap[from];
toIdx = this->edgeMap[to];
Expand Down
13 changes: 13 additions & 0 deletions src/setriq/_C/metrics/CdrDist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,20 @@ metric::CdrDist::CdrDist(const doubleMatrix& matrix, const stringIndexMap& index
}

double metric::CdrDist::forward(const std::string &a, const std::string &b) {
/**
* Compute the CdrDist metric between two input strings.
*
* @param a: on input string to be compared
* @param b: second input string to be compared
* @return the CdrDist metric between the input strings
*/

// calculate the numerator of CdrDist. This is the most expensive operation of the metric, as the sequences get
// aligned
double abScore = this->algorithm(a, b);

// when a == b in SW, the best score collapses down to a simple arithmetic sum over all the amino acid positions
// this gives a significant speed bump
double aaScore = this->algorithm.identityScore(a);
double bbScore = this->algorithm.identityScore(b);

Expand Down
33 changes: 27 additions & 6 deletions src/setriq/_C/metrics/TcrDist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,21 @@
#include <stdexcept>
#include "metrics/TcrDist.h"

metric::TcrDist::TcrDist(const doubleMatrix& subMat, const stringIndexMap& index, double gapPen, char gapSym, double weight) {
metric::TcrDist::TcrDist(const doubleMatrix& subMat,
const stringIndexMap& index,
double gapPen,
char gapSym,
double weight) {
/**
* Initialize the TcrDist object.
*
* @param subMat: the substitution scoring matrix
* @param index: the token index
* @param gapPen: the metric gap penalty
* @param gapSym: the gap symbol to be used (e.g. "-")
* @param weight: the weight of the metric component output
*/

SubstitutionMatrix sm {subMat, index};
this->substitutionMatrix = sm;
this->gapPenalty = gapPen;
Expand All @@ -14,19 +28,26 @@ metric::TcrDist::TcrDist(const doubleMatrix& subMat, const stringIndexMap& index
}

double metric::TcrDist::forward(const std::string &a, const std::string &b) {
if (a.size() != b.size()) throw std::invalid_argument("input strings must be of equal length.");
/**
* Compute the TcrDist metric (Dash et al) between two provided sequences. Be sure to provide sequences of equal
* length. Errors are handled in the Python API for speed and interpretability considerations.
*
* @param a: a string to be compared
* @param b: another string to be compared
* @return TcrDist metric between the two strings
*/

double distance {0}, substitution;
for (size_t i = 0; i < a.size(); i++) {
if (a[i] == b[i]) continue;

if (a[i] == this->gapSymbol || b[i] == this->gapSymbol) {
distance += this->gapPenalty * this->distanceWeight;
distance += this->gapPenalty;
continue;
}

substitution = this->substitutionMatrix(a.substr(i, 1), b.substr(i, 1));
distance += std::min(4., 4. - substitution) * this->distanceWeight;
substitution = this->substitutionMatrix(a[i], b[i]);
distance += std::min(4., 4. - substitution);
}
return distance;
return distance * this->distanceWeight;
}
7 changes: 5 additions & 2 deletions src/setriq/modules/_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Dict, List

import numpy as np
import pandas as pd
from glom import glom

import setriq._C as C
Expand Down Expand Up @@ -98,6 +99,8 @@ def __init__(self,
self.fn = C.tcr_dist_component

def forward(self, sequences: List[str]) -> List[float]:
if not (len(sequences[0]) == pd.Series(sequences).str.len()).all():
raise ValueError('Sequences must be of equal length')
out = self.fn(sequences, **self.call_args)

return out
Expand Down Expand Up @@ -178,7 +181,7 @@ def __init__(self, **components):
for name, component in components.items():
# some type checking
if not isinstance(component, TcrDistComponent):
raise TypeError(f'{repr(name)} is not of type {TcrDistComponent}')
raise TypeError(f'{repr(name)} is not of type {TcrDistComponent.__class__.__name__}')

self.__setattr__(name, component)
parts.append(name)
Expand All @@ -199,7 +202,7 @@ def _check_input_format(self, ipt):

diff = pts.difference(ipt)
if diff:
raise ValueError('please inspect inputs - missing key(s): {}'.format(', '.join(map(repr, diff))))
raise ValueError('Missing key(s): {}'.format(', '.join(map(repr, diff))))

@property
def required_input_keys(self) -> List[str]:
Expand Down
1 change: 1 addition & 0 deletions src/setriq/modules/_substitution.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class SubstitutionMatrix(abc.ABC):
here we can see that we can provide any arbitrary substitution matrix, but in general it is advised to use the
pre-loaded BLOSUM matrices
>>> [BLOSUM45, BLOSUM62, BLOSUM90] # choose one of the following
these are just instances of `SubstitutionMatrix`, initialised through `from_json`
"""
_required_keys = (
Expand Down

0 comments on commit 25127f3

Please sign in to comment.