In [2]:
# Read the contents of all_usi.txt and split by lines
with open('all_usi.txt', 'r') as file:
    usis = file.readlines()

# Strip newline characters from each line
usis = [line.strip() for line in usis]

# Display the lines
print(usis[:10])  # Display only the first 10 lines

['mzspec:PXD015899:CPTAC_OvC_JB5429_iTRAQ_18_4Apr12_Cougar_12-03-22:scan:08441:[iTRAQ4plex]-LLIYGGSTR/2', 'mzspec:PXD015899:CPTAC_OvC_JB5429_iTRAQ_18_4Apr12_Cougar_12-03-22:scan:08441:[iTRAQ4plex114]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s35_f03:scan:11092:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s44_f03:scan:11199:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD006463:MS161140-B-20_161024053838:scan:18934:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s23_f03_20180622040828:scan:11088:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s16_f03:scan:10529:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s21_f03:scan:10390:[TMT6plex]-LLIYGGSTR/2', 'mzspec:PXD019910:20180329_EMIF_tryp_s15_f03:scan:10501:[TMT6plex]-LLIYGGSTR/2', 'mzspec:MSV000086793:22CPTAC_LUAD_P_BI_20180726_BD_f08:scan:26048:[TMT6plex]-LLIYGGSTR/2']


In [14]:
from collections import defaultdict

# Define the labeling types and their corresponding keywords
labeling_types = {
    "No isobaric labeling": [],
    "[iTRAQ4plex]": ["[iTRAQ4plex]", "[iTRAQ4plex114]"],
    "[iTRAQ8plex]": ["[iTRAQ8plex]", "[iTRAQ8plex:13C(6)15N(2)]"],
    "[TMT6plex]": ["[TMT6plex]"],
    "[TMTpro]": ["[TMTpro]"],
    "[Dimethyl]": ["[Dimethyl]", "[Dimethyl:2H(6)13C(2)]", "[Dimethyl:2H(4)13C(2)]", "[Dimethyl:2H(4)]"]
}

# Group USIs by labeling type
grouped_usis = defaultdict(list)

for usi in usis:
    matched = False
    for label, keywords in labeling_types.items():
        if any(keyword in usi for keyword in keywords):
            grouped_usis[label].append(usi)
            matched = True
            break
    if not matched:
        grouped_usis["No isobaric labeling"].append(usi)

# Display the counts for each group
for label, group in grouped_usis.items():
    print(f"{label}: {len(group)} USIs")

    # Find USIs matched to multiple groups
    usi_to_groups = defaultdict(list)

    for label, usis_group in grouped_usis.items():
        for usi in usis_group:
            usi_to_groups[usi].append(label)

# Print USIs matched to more than one group
for usi, labels in usi_to_groups.items():
    if len(set(labels)) > 1:
        print(f"USI: {usi} matched to groups: {labels}")

[iTRAQ4plex]: 20116 USIs
[TMT6plex]: 50160 USIs
No isobaric labeling: 181331 USIs
[TMTpro]: 765 USIs
[Dimethyl]: 3887 USIs
[iTRAQ8plex]: 591 USIs


In [15]:
import re

# Extract USIs from "No isobaric labeling"
no_isobaric_usis = grouped_usis["No isobaric labeling"]

# Find all modifications (text within brackets)
modifications = set()
for usi in no_isobaric_usis:
    modifications.update(re.findall(r'\[.*?\]', usi))

# Print the modifications
print(modifications)

{'[+46.03274]', '[+471.20776]', '[+454.18121]', '[Oxidation]', '[+1541.85014]', '[+1431.83104]', '[Glu->pyro-Glu]', '[Thiazolidine]', '[Methylthio]', '[Gln->pyro-Glu]', '[Deamidated:18O(1)]', '[+1555.95614]', '[Label:13C(6)]', '[+186.1165]', '[LRGG]', '[Dicarbamidomethyl]', '[+75.04729]', '[Phospho]', '[Pyro-carbamidomethyl]', '[Deamidated]', '[+46.0328]', '[+85.05549]', '[+141.1154]', '[Nethylmaleimide]', '[Carbamidomethyl]', '[+186.1127]', '[+141.11544]', '[DiLeu4plex117]', '[Xlink:BuUrBu[85]', '[Label:2H(4)]', '[Cysteinyl]', '[Propionamide]', '[+271.1736]', '[0.0233]', '[ADP-Ribosyl]', '[+271.1735]', '[Acetyl]', '[Label:13C(6)15N(2)]', '[Label:13C(6)15N(4)]', '[GG]', '[Label:13C(6)15N(1)]', '[Ammonia-loss]'}


generate a tsv that with '[isobaric group][modification] count' in a way that: for each group, for each usi in that group, if it has a modification(a bracket and is not a isobaric grouop),count the number of that modificaiton for that group. if the usi does not have mod, add the count to [isobaric group][None] count

In [31]:
from collections import defaultdict
import csv

# Initialize a dictionary to store counts for each group and modification
modification_counts = defaultdict(lambda: defaultdict(int))

# Iterate through each group and its USIs
all_multiple_labeling_count = 0
for group, usis_group in grouped_usis.items():
    multiple_labeling_count = 0
    for usi in usis_group:
        # Extract modifications from the USI
        modifications_in_usi = re.findall(r'\[.*?\]', usi)
        non_isobaric_modifications = [
            mod for mod in modifications_in_usi if mod not in labeling_types[group]
        ]

        # Check if the USI has multiple labels
        if len(set(non_isobaric_modifications)) > 1:
            multiple_labeling_count += len(set(modifications_in_usi))

        # If no modifications are found, increment the count for [group][None]
        if not non_isobaric_modifications:
            modification_counts[group]["[None]"] += 1
        else:
            # Increment the count for each modification
            for mod in non_isobaric_modifications:
                modification_counts[group][mod] += 1
    all_multiple_labeling_count += multiple_labeling_count

# Write the counts to a TSV file
with open("modifications_count.tsv", "w", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["Group", "Modification", "Count"])
    for group, mods in modification_counts.items():
        for mod, count in mods.items():
            writer.writerow([group, mod, count])

    # Calculate the total number of USIs processed
    total_count = sum(sum(mods.values()) for mods in modification_counts.values())

    # Add the total count to the TSV file
    # writer.writerow(["Total", "", total_count])

    # Compare the total count to the original count of USIs
    print(f"Original USI count: {len(usis)}")
    print(f"Processed USI count: {total_count}")
    print(f"Counts match: {len(usis) == total_count}")

print(all_multiple_labeling_count)

Original USI count: 256850
Processed USI count: 293525
Counts match: False
43600


for verify with TMTpro no modificaiton number

In [33]:
# Count the number of USIs with [TMTpro] tag and no other modification tags
tmtpro_no_modification_count = sum(
    1 for usi in usis if '[TMTpro]' in usi and len([tag for tag in re.findall(r'\[.*?\]', usi) if tag != '[TMTpro]']) == 0
)

print(f"Number of USIs with [TMTpro] tag and no other modification tags: {tmtpro_no_modification_count}")

Number of USIs with [TMTpro] tag and no other modification tags: 593
