# **MetaboTandem**

# C. Annotation with SIRIUS

# 1. Separating MS2 into groups

This notebook separates the extracted MS2 features into several groups so that they can be annotated using SIRIUS.

## 1.1 Loading required libraries

In [13]:
import pandas as pd
import numpy as np
import glob, os, re, sys, shutil
from natsort import natsorted
from itertools import zip_longest

## 1.2 User Inputs Required <====
Provide the following variables:
- **file:** Filename of the MS2 spectra consensus file (`.mgf` file)
- **dirname:** Directory to store the groups

In [20]:
# Path to the consensus MS2 spectra (.mgf file) that will be split into groups
file = os.path.join('..', '..', 'data', 'ms2_spectrum_consensus.mgf')
# Directory in which the group_<N>.mgf files are written.
# NOTE(review): this is the same folder that holds `file` -- confirm whether a
# dedicated output sub-folder was intended.
dirname = os.path.join('..', '..', 'data')

## 1.3 Check data
Check that all spectra have the expected charge. If they do not, the `.mgf` file will need to be filtered before continuing.

In [17]:
%%bash -s "$file"
# Count how many spectra carry each CHARGE value.
# "$1" is the mgf path handed over by the magic line; it is quoted so that
# paths containing spaces or glob characters are passed to grep intact.
grep "CHARGE" "$1" | sort | uniq -c

    193 CHARGE=1-
     10 CHARGE=2-


## 1.4 Defining functions to separate data into groups

In [10]:
###############################################
# Functions:
###############################################

### get begin and end indices into tuples:
def get_indices(file):
    """Locate every spectrum block of an MGF file.

    Parameters
    ----------
    file : str or path-like
        Path to the MGF file to scan.

    Returns
    -------
    list of tuple(int, int)
        One ``(begin, end)`` pair per spectrum, where ``begin`` is the 0-based
        index of the ``BEGIN IONS`` line and ``end`` is the index of the line
        *after* the matching ``END IONS`` line, so that ``lines[begin:end]``
        yields the complete block.
    """
    begin_end_tuple = []
    open_begin = None  # index of a BEGIN IONS line still waiting for its END IONS

    with open(file, 'r') as ms2_consensus:
        # Iterate lazily; there is no need to load the whole file with readlines()
        for i, line in enumerate(ms2_consensus):
            marker = line.strip()
            # Match the delimiter lines themselves rather than any line that
            # merely contains "BEGIN"/"END" (e.g. TITLE=BLENDED), which would
            # otherwise misalign the begin/end pairs.
            if marker.startswith('BEGIN IONS'):
                open_begin = i
            elif marker.startswith('END IONS') and open_begin is not None:
                # store the index one past END IONS so it can be used as a slice end
                begin_end_tuple.append((open_begin, i + 1))
                open_begin = None

    return begin_end_tuple

#### grouper() collects the items of an iterable into tuples of n (used to batch n spectra per mgf file)
def grouper(iterable, n, fillvalue=None):
    """Yield consecutive tuples of ``n`` items taken from ``iterable``.

    The last tuple is padded with ``fillvalue`` when the number of items is
    not a multiple of ``n``.
    """
    # A single shared iterator handed to zip_longest n times: each output
    # tuple therefore consumes n successive items.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)


###############################################

## 1.5 Separating data into groups

### Creating directory to store groups

In [15]:
# Create the output directory if it does not exist yet.
# Only group files left over from a previous run are removed: wiping the whole
# directory with shutil.rmtree would also delete everything else stored there,
# including the input mgf whenever `dirname` is the folder that holds it.
os.makedirs(dirname, exist_ok=True)
for stale_group in glob.glob(os.path.join(glob.escape(dirname), 'group_*.mgf')):
    os.remove(stale_group)

### Getting the indices of each spectrum
This step identifies the line indices at which each spectrum begins and ends so that the spectra can be separated correctly.

In [18]:
# Minimum number of lines (BEGIN line through END line inclusive) that a
# spectrum block must span to be kept; shorter blocks are discarded.
MIN_SPECTRUM_LINES = 9

all_spectra = get_indices(file)
print(len(all_spectra))

# Build a new filtered list instead of popping from the list being iterated:
# removing items during iteration skips elements and shifts the indices, so
# the wrong entries would be dropped.
begin_end_tuple = [
    (begin, end)
    for begin, end in all_spectra
    if end - begin >= MIN_SPECTRUM_LINES
]
print(len(begin_end_tuple))

# Keep the whole file in memory as a list of lines (trailing whitespace
# stripped) so each spectrum can be sliced out by its (begin, end) indices.
with open(file, 'r') as f:
    lineList = [line.rstrip() for line in f]

203
203


### Separating the spectra into files containing 5 spectra each

In [19]:
# Number of spectra written to each group file
SPECTRA_PER_GROUP = 5

# Group files are numbered from 1: group_1.mgf, group_2.mgf, ...
for group_number, group in enumerate(grouper(begin_end_tuple, SPECTRA_PER_GROUP), start=1):
    # grouper() pads the last group with None; drop the padding so full and
    # partial groups share a single code path.
    spectra = [span for span in group if span is not None]

    group_path = os.path.join(dirname, 'group_' + str(group_number) + '.mgf')
    with open(group_path, 'w') as out:
        for begin, end in spectra:
            for line in lineList[begin:end]:
                out.write(line + '\n')
            # blank line separating consecutive spectra
            out.write('\n')