diff --git a/include/setriq/alignment/SubstitutionMatrix.h b/include/setriq/alignment/SubstitutionMatrix.h index d3aa7d0..dca0457 100644 --- a/include/setriq/alignment/SubstitutionMatrix.h +++ b/include/setriq/alignment/SubstitutionMatrix.h @@ -17,8 +17,8 @@ class SubstitutionMatrix { SubstitutionMatrix() : subMatrix(), edgeMap() {}; SubstitutionMatrix(const doubleMatrix&, const stringIndexMap&); - double forward (const std::string&, const std::string&); - double operator () (const std::string& a, const std::string &b) { return this->forward(a, b); }; + double forward (const char&, const char&); + double operator () (const char& a, const char &b) { return this->forward(a, b); }; }; diff --git a/include/setriq/utils/typeDefs.h b/include/setriq/utils/typeDefs.h index 5bb24da..284c039 100644 --- a/include/setriq/utils/typeDefs.h +++ b/include/setriq/utils/typeDefs.h @@ -5,13 +5,13 @@ #ifndef METRICS_TYPEDEFS_H #define METRICS_TYPEDEFS_H -#include #include +#include #include typedef std::vector stringVector; typedef std::vector doubleVector; typedef std::vector doubleMatrix; -typedef std::map stringIndexMap; +typedef std::unordered_map stringIndexMap; #endif //METRICS_TYPEDEFS_H diff --git a/setup.py b/setup.py index bfff24d..bc67687 100644 --- a/setup.py +++ b/setup.py @@ -89,6 +89,7 @@ def get_link_args(): install_requires=[ 'glom>=20.0.0,<21.0.0', 'numpy>=1.0.0,<2.0.0', + 'pandas>=1.0.0,<2.0.0', 'srsly>=2.0.0,<3.0.0', ], package_dir={f'{PROJECT_NAME}': f'{SOURCE_DIR}/{PROJECT_NAME}'}, diff --git a/src/setriq/_C/alignment/SmithWaterman.cpp b/src/setriq/_C/alignment/SmithWaterman.cpp index 4984f98..fef3a1a 100644 --- a/src/setriq/_C/alignment/SmithWaterman.cpp +++ b/src/setriq/_C/alignment/SmithWaterman.cpp @@ -23,8 +23,7 @@ double SmithWaterman::fillScoringMatrix(doubleMatrix &scoringMatrix, for (size_t i = 1; i < N; i++) { for (size_t j = 1; j < M; j++) { - alignmentScore = scoringMatrix[i - 1][j - 1] + this->substitutionMatrix(a.substr(i - 1, 1), - b.substr(j - 1, 1)); + alignmentScore = scoringMatrix[i - 1][j - 1] + this->substitutionMatrix(a[i - 1], b[j - 1]); kGapScore = this->calculateGapPenalty(scoringMatrix, i, j, kAxis); lGapScore = this->calculateGapPenalty(scoringMatrix, j, i, lAxis); @@ -74,7 +73,7 @@ double SmithWaterman::identityScore(const std::string &inputString) { double score {0}; for (size_t i = 0; i < N; i++) { - score += this->substitutionMatrix(inputString.substr(i, 1), inputString.substr(i, 1)); + score += this->substitutionMatrix(inputString[i], inputString[i]); } return score; } diff --git a/src/setriq/_C/alignment/SubstitutionMatrix.cpp b/src/setriq/_C/alignment/SubstitutionMatrix.cpp index 6315746..e3a2655 100644 --- a/src/setriq/_C/alignment/SubstitutionMatrix.cpp +++ b/src/setriq/_C/alignment/SubstitutionMatrix.cpp @@ -9,7 +9,7 @@ SubstitutionMatrix::SubstitutionMatrix(const doubleMatrix& matrix, const stringI this->edgeMap = index; } -double SubstitutionMatrix::forward(const std::string &from, const std::string &to) { +double SubstitutionMatrix::forward(const char &from, const char &to) { size_t fromIdx, toIdx; fromIdx = this->edgeMap[from]; toIdx = this->edgeMap[to]; diff --git a/src/setriq/_C/metrics/CdrDist.cpp b/src/setriq/_C/metrics/CdrDist.cpp index cdf777d..5bd2971 100644 --- a/src/setriq/_C/metrics/CdrDist.cpp +++ b/src/setriq/_C/metrics/CdrDist.cpp @@ -14,7 +14,20 @@ metric::CdrDist::CdrDist(const doubleMatrix& matrix, const stringIndexMap& index } double metric::CdrDist::forward(const std::string &a, const std::string &b) { + /** + * Compute the CdrDist metric between two input strings. + * + * @param a: on input string to be compared + * @param b: second input string to be compared + * @return the CdrDist metric between the input strings + */ + + // calculate the numerator of CdrDist. This is the most expensive operation of the metric, as the sequences get + // aligned double abScore = this->algorithm(a, b); + + // when a == b in SW, the best score collapses down to a simple arithmetic sum over all the amino acid positions + // this gives a significant speed bump double aaScore = this->algorithm.identityScore(a); double bbScore = this->algorithm.identityScore(b); diff --git a/src/setriq/_C/metrics/TcrDist.cpp b/src/setriq/_C/metrics/TcrDist.cpp index 5df1795..65d6481 100644 --- a/src/setriq/_C/metrics/TcrDist.cpp +++ b/src/setriq/_C/metrics/TcrDist.cpp @@ -5,7 +5,21 @@ #include #include "metrics/TcrDist.h" -metric::TcrDist::TcrDist(const doubleMatrix& subMat, const stringIndexMap& index, double gapPen, char gapSym, double weight) { +metric::TcrDist::TcrDist(const doubleMatrix& subMat, + const stringIndexMap& index, + double gapPen, + char gapSym, + double weight) { + /** + * Initialize the TcrDist object. + * + * @param subMat: the substitution scoring matrix + * @param index: the token index + * @param gapPen: the metric gap penalty + * @param gapSym: the gap symbol to be used (e.g. "-") + * @param weight: the weight of the metric component output + */ + SubstitutionMatrix sm {subMat, index}; this->substitutionMatrix = sm; this->gapPenalty = gapPen; @@ -14,19 +28,26 @@ metric::TcrDist::TcrDist(const doubleMatrix& subMat, const stringIndexMap& index } double metric::TcrDist::forward(const std::string &a, const std::string &b) { - if (a.size() != b.size()) throw std::invalid_argument("input strings must be of equal length."); + /** + * Compute the TcrDist metric (Dash et al) between two provided sequences. Be sure to provide sequences of equal + * length. Errors are handled in the Python API for speed and interpretability considerations. + * + * @param a: a string to be compared + * @param b: another string to be compared + * @return TcrDist metric between the two strings + */ double distance {0}, substitution; for (size_t i = 0; i < a.size(); i++) { if (a[i] == b[i]) continue; if (a[i] == this->gapSymbol || b[i] == this->gapSymbol) { - distance += this->gapPenalty * this->distanceWeight; + distance += this->gapPenalty; continue; } - substitution = this->substitutionMatrix(a.substr(i, 1), b.substr(i, 1)); - distance += std::min(4., 4. - substitution) * this->distanceWeight; + substitution = this->substitutionMatrix(a[i], b[i]); + distance += std::min(4., 4. - substitution); } - return distance; + return distance * this->distanceWeight; } diff --git a/src/setriq/modules/_distances.py b/src/setriq/modules/_distances.py index eacfa58..ae7ad5b 100644 --- a/src/setriq/modules/_distances.py +++ b/src/setriq/modules/_distances.py @@ -8,6 +8,7 @@ from typing import Dict, List import numpy as np +import pandas as pd from glom import glom import setriq._C as C @@ -98,6 +99,8 @@ def __init__(self, self.fn = C.tcr_dist_component def forward(self, sequences: List[str]) -> List[float]: + if not (len(sequences[0]) == pd.Series(sequences).str.len()).all(): + raise ValueError('Sequences must be of equal length') out = self.fn(sequences, **self.call_args) return out @@ -178,7 +181,7 @@ def __init__(self, **components): for name, component in components.items(): # some type checking if not isinstance(component, TcrDistComponent): - raise TypeError(f'{repr(name)} is not of type {TcrDistComponent}') + raise TypeError(f'{repr(name)} is not of type {TcrDistComponent.__class__.__name__}') self.__setattr__(name, component) parts.append(name) @@ -199,7 +202,7 @@ def _check_input_format(self, ipt): diff = pts.difference(ipt) if diff: - raise ValueError('please inspect inputs - missing key(s): {}'.format(', '.join(map(repr, diff)))) + raise ValueError('Missing key(s): {}'.format(', '.join(map(repr, diff)))) @property def required_input_keys(self) -> List[str]: diff --git a/src/setriq/modules/_substitution.py b/src/setriq/modules/_substitution.py index f7ca1e3..13c4155 100644 --- a/src/setriq/modules/_substitution.py +++ b/src/setriq/modules/_substitution.py @@ -50,6 +50,7 @@ class SubstitutionMatrix(abc.ABC): here we can see that we can provide any arbitrary substitution matrix, but in general it is advised to use the pre-loaded BLOSUM matrices >>> [BLOSUM45, BLOSUM62, BLOSUM90] # choose one of the following + these are just instances of `SubstitutionMatrix`, initialised through `from_json` """ _required_keys = (