From 293995d56b59811b0398d28a6925c8d283393ffc Mon Sep 17 00:00:00 2001 From: BenTenmann Date: Sun, 5 Dec 2021 19:02:39 +0000 Subject: [PATCH] feat: add tcr sub-mat default --- src/setriq/data/tcr-dist-blosum-62.json | 654 ++++++++++++++++++++++++ src/setriq/modules/distances.py | 14 +- src/setriq/modules/substitution.py | 65 ++- 3 files changed, 726 insertions(+), 7 deletions(-) create mode 100644 src/setriq/data/tcr-dist-blosum-62.json diff --git a/src/setriq/data/tcr-dist-blosum-62.json b/src/setriq/data/tcr-dist-blosum-62.json new file mode 100644 index 0000000..6920fcd --- /dev/null +++ b/src/setriq/data/tcr-dist-blosum-62.json @@ -0,0 +1,654 @@ +{ + "index":{ + "A":0, + "R":1, + "N":2, + "D":3, + "C":4, + "Q":5, + "E":6, + "G":7, + "H":8, + "I":9, + "L":10, + "K":11, + "M":12, + "F":13, + "P":14, + "S":15, + "T":16, + "W":17, + "Y":18, + "V":19, + "B":20, + "Z":21, + "X":22, + "*":23 + }, + "substitution_matrix":[ + [ + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 0, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 0, + 3, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 1, + 4, + 4, + 4 + ], + [ + 4, + 4, + 3, + 0, + 4, + 4, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 3, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 3, + 4, + 4, + 4, + 0, + 2, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 1, + 4, + 4 + ], + [ + 4, + 4, + 4, + 2, + 4, + 2, + 0, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 0, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 
4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 2, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 1, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + 0, + 4, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4 + ], + [ + 4, + 2, + 4, + 4, + 4, + 3, + 3, + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 2, + 4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 3, + 1, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 3, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 0, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 0, + 2, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 2, + 4, + 4, + 4, + 4, + 1, + 4, + 4, + 4, + 2, + 0, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 1, + 3, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 1, + 0, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 0, + 3, + 4, + 4 + ], + [ + 4, + 4, + 4, + 3, + 4, + 1, + 0, + 4, + 4, + 4, + 4, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 0, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4 + ], + [ + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 
3 + ] + ] +} \ No newline at end of file diff --git a/src/setriq/modules/distances.py b/src/setriq/modules/distances.py index 629dff4..c5cfaa9 100644 --- a/src/setriq/modules/distances.py +++ b/src/setriq/modules/distances.py @@ -15,7 +15,8 @@ from .substitution import ( BLOSUM45, BLOSUM62, - SubstitutionMatrix + SubstitutionMatrix, + TCR_DIST_BLOSUM_DEFAULT ) __all__ = [ @@ -154,8 +155,8 @@ class TcrDist(Metric): Examples -------- >>> sequences = [ - ... {'cdr_1': '', 'cdr_2': '', 'cdr_3': 'CASSLKPNTE'}, - ... {'cdr_1': '', 'cdr_2': '', 'cdr_3': 'CASS-HIANY'}, + ... {'cdr_1': 'TSG------FNG', 'cdr_2': 'VVL----DGL', 'cdr_2_5': 'SRSN-GY', 'cdr_3': 'CAVR--'}, + ... {'cdr_1': 'TSG------FYG', 'cdr_2': 'NGL----DGL', 'cdr_2_5': 'SRSD-SY', 'cdr_3': 'CA-------'}, ... {'cdr_1': '', 'cdr_2': '', 'cdr_3': 'CASRGAT--Q'} ... ] >>> metric = TcrDist() # will produce a warning stating default configuration (Dash et al) @@ -168,9 +169,10 @@ class TcrDist(Metric): epitope-specific T cell receptor repertoires. Nature, 547(7661), pp.89-93. (https://doi.org/10.1038/nature22383) """ _default = [ - ('cdr_1', {'substitution_matrix': BLOSUM62, 'gap_penalty': 4., 'weight': 1.}), - ('cdr_2', {'substitution_matrix': BLOSUM62, 'gap_penalty': 4., 'weight': 1.}), - ('cdr_3', {'substitution_matrix': BLOSUM62, 'gap_penalty': 8., 'weight': 3.}) + ('cdr_1', {'substitution_matrix': TCR_DIST_BLOSUM_DEFAULT, 'gap_penalty': 4., 'weight': 1.}), + ('cdr_2', {'substitution_matrix': TCR_DIST_BLOSUM_DEFAULT, 'gap_penalty': 4., 'weight': 1.}), + ('cdr_2_5', {'substitution_matrix': TCR_DIST_BLOSUM_DEFAULT, 'gap_penalty': 4., 'weight': 1.}), + ('cdr_3', {'substitution_matrix': TCR_DIST_BLOSUM_DEFAULT, 'gap_penalty': 8., 'weight': 3.}) ] _default_msg = ( 'TcrDist has been initialized using the default configuration. 
' diff --git a/src/setriq/modules/substitution.py b/src/setriq/modules/substitution.py index 13c4155..0a73adc 100644 --- a/src/setriq/modules/substitution.py +++ b/src/setriq/modules/substitution.py @@ -15,10 +15,12 @@ import srsly __all__ = [ - 'SubstitutionMatrix', 'BLOSUM45', 'BLOSUM62', 'BLOSUM90', + 'SubstitutionMatrix', + 'TCR_DIST_BLOSUM62', + 'TCR_DIST_BLOSUM_DEFAULT' ] @@ -107,6 +109,9 @@ def from_json(cls, file_path: Union[str, pathlib.Path]) -> "SubstitutionMatrix": model = cls(**values) return model + def __len__(self): + return len(self.substitution_matrix) + def __getitem__(self, key: str): out = self.__getattribute__(key) return out @@ -143,6 +148,56 @@ def __call__(self, a: str, b: str) -> float: out = self.substitution_matrix[i][j] return out + def add_token(self, token: str, values: Union[float, List[float]]): + """ + Add a special token to the substitution matrix with a given value or list of values. + + Parameters + ---------- + token : str + a special token to be added + values : Union[float, List[float]] + a value or list of values to which the token will correspond. If a list of floats is provided, the list must + have a length of `len(substitution_matrix) + 1`, i.e. there must be number of rows + 1 elements in the list + + Returns + ------- + None + this is an inplace operation + + Examples + -------- + Single value example. The value is repeated to fit the required shape + >>> sm = BLOSUM62 + >>> sm.add_token('-', 4.) + + List of floats example + >>> sm = BLOSUM62 + >>> len(sm) + ... 
24 + >>> sm.add_token('setriq', [*range(26)]) # ints implicitly converted to floats + + """ + if token in self.index: + raise ValueError('`token` already exists') + + # generate the full row in case of single value + if isinstance(values, float): + values = [values for _ in range(len(self.substitution_matrix[0]) + 1)] + + # check if first dimension fits + if len(values) - 1 != len(self.substitution_matrix): + raise ValueError('`values` and `substitution_matrix` must have same dimension 0') + + # append the values in the new row to each existing row to enforce symmetry + values = [*map(float, values)] + for scoring_row, new_value in zip(self.substitution_matrix, values[:-1]): + scoring_row.append(new_value) + + # append the new row and add new token + self.substitution_matrix.append(values) + self.index[token] = len(self.substitution_matrix) - 1 + # below we load the matrices which come with the package -- this exposes them to the user # they will be used for default settings in a number of metrics @@ -152,3 +207,11 @@ def __call__(self, a: str, b: str) -> float: BLOSUM45 = SubstitutionMatrix.from_json(DATA_DIR / 'blosum-45.json') BLOSUM62 = SubstitutionMatrix.from_json(DATA_DIR / 'blosum-62.json') BLOSUM90 = SubstitutionMatrix.from_json(DATA_DIR / 'blosum-90.json') + +# the below is a pre-computed substitution matrix for the default TcrDist configuration +# note that the gap symbol is omitted from this for flexibility +TCR_DIST_BLOSUM62 = SubstitutionMatrix.from_json(DATA_DIR / 'tcr-dist-blosum-62.json') + +# here we have an instance *with* the default gap symbol +TCR_DIST_BLOSUM_DEFAULT = SubstitutionMatrix.from_json(DATA_DIR / 'tcr-dist-blosum-62.json') +TCR_DIST_BLOSUM_DEFAULT.add_token('-', 4.)