In [4]:
import pandas as pd
import sys
sys.path.append('../')
from src.agnostic_search.mgf.processer import process as mgf_processor
# from src.agnostic_search.driver import main

In [2]:
from collections import namedtuple
# Lightweight, immutable stand-in for a parsed-arguments object; `main` only
# reads attributes from it.
Namespace = namedtuple(
    'Namespace',
    'IdFile filter max_peaks_per_target mgfQueryFile mgfTargetFile '
    'mz_error mz_error_type output_directory',
)

# The same MGF file is passed as both the query and the target.
args = Namespace(
    IdFile='../data/id_dir/merged_compact_id_file.txt',
    filter=False,
    max_peaks_per_target=20,
    mgfQueryFile='../data/mgf/171202_Ecoli_ctrl2_1ug.mgf',
    mgfTargetFile='../data/mgf/171202_Ecoli_ctrl2_1ug.mgf',
    mz_error=20,
    mz_error_type='ppm',
    output_directory=None,
)

In [7]:
import os

def find_query_range(mgf_queries, low, high):
    """Locate the slice of ``mgf_queries`` whose mass key lies in ``[low, high]``.

    The key of each entry is ``pepmass * charge``. Both boundaries are found by
    binary search, so ``mgf_queries`` must already be ordered by ascending key.

    Args:
        mgf_queries: Indexable sequence of objects exposing ``pepmass`` and
            ``charge`` attributes.
        low: Inclusive lower limit for the key.
        high: Inclusive upper limit for the key.

    Returns:
        ``(low_idx, high_idx)`` such that ``mgf_queries[low_idx:high_idx]``
        holds the entries with ``low <= key <= high``. ``high_idx`` is already
        an exclusive end, so no ``+ 1`` adjustment is needed when slicing.
    """
    total = len(mgf_queries)

    def first_index_where(predicate):
        # Smallest index whose key satisfies `predicate`, or `total` when none
        # does; relies on the predicate switching from False to True only once.
        start, stop = 0, total
        while start < stop:
            middle = start + (stop - start) // 2
            entry = mgf_queries[middle]
            if predicate(entry.pepmass * entry.charge):
                stop = middle
            else:
                start = middle + 1
        return start

    low_idx = first_index_where(lambda key: low <= key)
    high_idx = first_index_where(lambda key: high < key)
    return low_idx, high_idx

def main(args):
    """Compare each target scan with the identified query scans in its window.

    Steps:
        1. Load the query and target MGF files.
        2. Read the identification table and keep the rows whose ``Filename``
           matches the query MGF file name.
        3. For every target, compute its search window and count matches
           against each identified query that falls inside that window.

    ``args.filter``, ``args.max_peaks_per_target`` and
    ``args.output_directory`` are not used here; no filtering step is
    implemented yet.

    Args:
        args: Object exposing ``mgfQueryFile``, ``mgfTargetFile``, ``IdFile``,
            ``mz_error`` and ``mz_error_type`` attributes.

    Returns:
        pandas.DataFrame with one row per value returned by
        ``target.count_matches`` and the columns ``count``,
        ``target_matched``, ``query_matched``, ``target_log`` and
        ``query_log``.
    """
    print(os.getcwd())
    print(args)

    print("LOADING QUERY MGF")
    mgf_queries = mgf_processor(os.path.abspath(args.mgfQueryFile))
    print("LOADING TARGET MGF")
    mgf_targets = mgf_processor(os.path.abspath(args.mgfTargetFile))
    print("LOADING ID MGF")
    # Rows are selected by the MGF base name with everything after the first
    # dot removed.
    mgf_filename = os.path.basename(args.mgfQueryFile).split(".")[0]
    df = pd.read_table(args.IdFile, index_col="scan")
    df = df.loc[df["Filename"] == mgf_filename]
    # FIXME: NA values are replaced with empty strings; make the reason
    # explicit or drop this step if it is not needed.
    df = df.fillna("")

    print(df, df.shape)

    # Only queries that have an identification row take part in the comparison.
    q = [query for query in mgf_queries if query.scan in df.index]
    r = []
    if df.shape[0]:
        for idx, target in enumerate(mgf_targets):
            window = target.get_window()
            # NOTE(review): find_query_range binary-searches on
            # pepmass * charge, so `q` must be sorted by that key — confirm
            # that mgf_processor returns spectra in that order.
            low_idx, high_idx = find_query_range(q, *window)

            if idx % 1000 == 0:
                print(idx)

            for query in q[low_idx:high_idx]:
                r.append(
                    target.count_matches(query, args.mz_error, args.mz_error_type)
                )

    # Previously `count` was initialised to 0 and never updated, so this line
    # always printed 0; report the number of comparisons actually performed.
    count = len(r)
    print(count)

    results = pd.DataFrame(
        r,
        columns=[
            "count",
            "target_matched",
            "query_matched",
            "target_log",
            "query_log",
        ],
    )
    return results

In [8]:
# Keep the result: the sorting cell further down reads it as `match_df`, which
# was otherwise never assigned in this notebook.
match_df = main(args)
match_df

/home/mark/Projects/fenyolab/agnostic_search_py3/notebooks
Namespace(IdFile='../data/id_dir/merged_compact_id_file.txt', filter=False, max_peaks_per_target=20, mgfQueryFile='../data/mgf/171202_Ecoli_ctrl2_1ug.mgf', mgfTargetFile='../data/mgf/171202_Ecoli_ctrl2_1ug.mgf', mz_error=20, mz_error_type='ppm', output_directory=None)
LOADING QUERY MGF
'process': '/home/mark/Projects/fenyolab/agnostic_search_py3/data/mgf/171202_Ecoli_ctrl2_1ug.mgf'  7.51 s
LOADING TARGET MGF
'process': '/home/mark/Projects/fenyolab/agnostic_search_py3/data/mgf/171202_Ecoli_ctrl2_1ug.mgf'  7.59 s
LOADING ID MGF
                peptide              modifications charge   expect  \
scan                                                                 
25300    APLDNDIGVSEATR                                 2    1e-15   
25039     ASCTTNCLAPLAK   57.02147@C3,57.02147@C7,      2  0.00056   
31700  CSHYPNHPLWYTLCDR  57.02147@C1,57.02147@C14,      3  2.4e-10   

      total_MS2_intensity                                

Unnamed: 0,count,target_matched,query_matched,target_log,query_log
0,0,0.000000,0.000000,3.816380,4.374254
1,0,0.000000,0.000000,3.625748,4.374254
2,2,0.007720,0.013145,4.605390,4.374254
3,3,0.023070,0.022065,4.354904,4.374254
4,2,0.014822,0.011668,4.270326,4.374254
...,...,...,...,...,...
25211,1,0.003583,0.003189,4.689083,4.739724
25212,0,0.000000,0.000000,3.986674,4.739724
25213,2,0.005208,0.005666,4.776338,4.739724
25214,0,0.000000,0.000000,4.252963,4.739724


In [14]:
# Order the rows by the `target_matched` column, ascending.
match_df.sort_values(by="target_matched")

Unnamed: 0,count,target_matched,query_matched,target_log,query_log
0,0,0.000000,0.000000,3.816380,4.374254
15930,0,0.000000,0.000000,4.522317,4.691727
15934,0,0.000000,0.000000,4.734606,4.691727
15936,0,0.000000,0.000000,3.600220,4.691727
15947,0,0.000000,0.000000,3.854056,4.374254
...,...,...,...,...,...
24860,1,0.121442,0.022280,3.936648,4.739724
19508,1,0.122812,0.012863,3.759821,4.739724
22083,20,0.234634,0.234634,4.739724,4.739724
11092,20,0.276886,0.276886,4.691727,4.691727


In [16]:
# Read the tab-separated SAMPEI output file into a DataFrame.
samp_df = pd.read_csv('../SAMPEI_output.txt', sep='\t')

In [23]:
# Display the selected SAMPEI columns in this fixed order.
samp_df.loc[
    :,
    [
        'diff_dalton',
        'diff_dalton_bin',
        'query_scan',
        'query_scan_mz',
        'query_scan_charge',
        'target_scan',
        'target_scan_mz',
        'target_scan_charge',
        'matches',
        'matched_query',
        'matched_intensity_product',
        'sum_log_intensity',
        'peptide',
        'modifications',
        'expect',
        'total_MS2_intensity',
        'proteins',
        'matched_peptide_intensity_max',
        'largest_gap',
        'mod_with_maxmatchedintensity',
    ],
]

Unnamed: 0,diff_dalton,diff_dalton_bin,query_scan,query_scan_mz,query_scan_charge,target_scan,target_scan_mz,target_scan_charge,matches,matched_query,matched_intensity_product,sum_log_intensity,peptide,modifications,expect,total_MS2_intensity,proteins,matched_peptide_intensity_max,largest_gap,mod_with_maxmatchedintensity
0,17.0090,17.0,69070,1289.320557,3,68346,971.242676,4,0.45,0.594226,0.348233,10.792598,AALAGGTTMIIDHVVPEPGTSLLAAFDQWREWADSK,"6.02013@K36,6.02013@R30,",2.800000e-11,338844.156139,"ENSP00000427985,ENSP00000309539",0.4,16,"6.02013@K36,6.02013@R30,17.0090332031@A2"
1,18.0092,18.0,69070,1289.320557,3,68298,1295.323608,3,0.40,0.540569,0.271003,10.137417,AALAGGTTMIIDHVVPEPGTSLLAAFDQWREWADSK,"6.02013@K36,6.02013@R30,",2.800000e-11,338844.156139,"ENSP00000427985,ENSP00000309539",0.3,15,"6.02013@K36,6.02013@R30,18.0091552734@A2"
2,18.0227,18.0,69070,1289.320557,3,67338,971.496094,4,0.35,0.508751,0.235322,11.147984,AALAGGTTMIIDHVVPEPGTSLLAAFDQWREWADSK,"6.02013@K36,6.02013@R30,",2.800000e-11,338844.156139,"ENSP00000427985,ENSP00000309539",0.4,16,"6.02013@K36,6.02013@R30,18.0227050781@A2"
3,19.0117,19.0,69070,1289.320557,3,67423,971.743347,4,0.35,0.516769,0.212074,11.213922,AALAGGTTMIIDHVVPEPGTSLLAAFDQWREWADSK,"6.02013@K36,6.02013@R30,",2.800000e-11,338844.156139,"ENSP00000427985,ENSP00000309539",0.4,16,"6.02013@K36,6.02013@R30,19.01171875@A2"
4,22.0232,22.0,69229,1287.649170,3,68346,971.242676,4,0.40,0.505962,0.267896,10.563961,AALAGGTTMIIDHVVPEPGTSLLAAFDQWREWADSK,"6.02013@R30,",6.500000e-02,223872.113857,"ENSP00000427985,ENSP00000309539",0.1,26,"6.02013@R30,22.0231933594@A1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29601,11.9830,12.0,68124,693.037292,3,63489,697.031616,3,0.35,0.616064,0.365979,11.513984,YSLLPFWYTLLYQAHR,"6.02013@R16,",1.600000e-03,851138.038202,"ENSP00000442962,ENSP00000349053,ENSP0000034046...",0.0,15,
29602,-180.1938,-180.0,68124,693.037292,3,69422,949.459045,2,0.35,0.504239,0.227490,11.312543,YSLLPFWYTLLYQAHR,"6.02013@R16,",1.600000e-03,851138.038202,"ENSP00000442962,ENSP00000349053,ENSP0000034046...",0.0,15,
29603,10.9758,11.0,68124,693.037292,3,66908,1045.043823,2,0.40,0.620476,0.305163,11.505080,YSLLPFWYTLLYQAHR,"6.02013@R16,",1.600000e-03,851138.038202,"ENSP00000442962,ENSP00000349053,ENSP0000034046...",0.0,15,
29604,10.9758,11.0,68124,693.037292,3,66907,1045.043823,2,0.40,0.620476,0.265359,11.447239,YSLLPFWYTLLYQAHR,"6.02013@R16,",1.600000e-03,851138.038202,"ENSP00000442962,ENSP00000349053,ENSP0000034046...",0.0,15,


In [2]:
# Read the tab-separated merged identification file, indexed by its `scan` column.
df = pd.read_csv('data/id_dir/merged_compact_id_file.txt', sep='\t', index_col="scan")

In [3]:
# Display the identification table loaded in the previous cell.
df

Unnamed: 0_level_0,peptide,modifications,charge,expect,total_MS2_intensity,proteins,Filename
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
28753,AAHPPFASWR,,2,0.00057,549540.873857625,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
23255,APLDNDIGVSEATR,,2,1.6e-13,602559.586074358,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
16153,AQYEDIAQK,,2,0.00047000000000000004,691830.9709189361,ENSP00000252244,171202_Ecoli_ctrl1_1ug
23031,ASCTTNCLAPLAK,"57.02147@C3,57.02147@C7,",2,2.6e-06,263026.799189538,"ENSP00000222286,ENSP00000229239,ENSP0000038006...",171202_Ecoli_ctrl1_1ug
35420,AVVELHTADGTLIEAEACDVGFR,"57.02147@C18,",3,5.8e-09,616595.001861482,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
...,...,...,...,...,...,...,...
17482,YSQQQLMETSHR,,3,8.9e-09,1122018.45430196,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
scan,peptide,modifications,charge,expect,total_MS2_intensity,proteins,Filename
25300,APLDNDIGVSEATR,,2,1e-15,204173.794466953,sp_BGAL_ECOLI_,171202_Ecoli_ctrl2_1ug
25039,ASCTTNCLAPLAK,"57.02147@C3,57.02147@C7,",2,0.00056,102329.299228075,"ENSP00000229239,ENSP00000380065,ENSP0000038006...",171202_Ecoli_ctrl2_1ug


In [None]:
# NOTE(review): this cell has no execution count, so it was never run. It looks
# up the index label 'F' — presumably an abandoned experiment; confirm and
# delete.
df.loc['F']

In [6]:
# The MGF `process` function is imported at the top of this notebook under the
# alias `mgf_processor`; the bare name `process` is never bound and raises
# NameError on a fresh kernel.
query = mgf_processor('data/mgf/171202_Ecoli_ctrl2_1ug.mgf')

In [20]:
# Check whether scan 31700 occurs among the loaded spectra (scans compared as ints).
31700 in {int(spectrum.scan) for spectrum in query}

True

In [21]:
# Keep the identification rows whose scan index appears among the loaded spectra.
df[df.index.isin([spectrum.scan for spectrum in query])]

Unnamed: 0_level_0,peptide,modifications,charge,expect,total_MS2_intensity,proteins,Filename
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
28753,AAHPPFASWR,,2,0.00057,549540.873857625,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
23255,APLDNDIGVSEATR,,2,1.6e-13,602559.586074358,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
16153,AQYEDIAQK,,2,0.00047000000000000004,691830.9709189361,ENSP00000252244,171202_Ecoli_ctrl1_1ug
23031,ASCTTNCLAPLAK,"57.02147@C3,57.02147@C7,",2,2.6e-06,263026.799189538,"ENSP00000222286,ENSP00000229239,ENSP0000038006...",171202_Ecoli_ctrl1_1ug
35420,AVVELHTADGTLIEAEACDVGFR,"57.02147@C18,",3,5.8e-09,616595.001861482,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
...,...,...,...,...,...,...,...
24432,YHHPAGEAFWTGER,,2,0.0007099999999999999,194984.459975805,ENSP00000325519,171202_Ecoli_ctrl1_1ug
17482,YSQQQLMETSHR,,3,8.9e-09,1122018.45430196,sp_BGAL_ECOLI_,171202_Ecoli_ctrl1_1ug
25300,APLDNDIGVSEATR,,2,1e-15,204173.794466953,sp_BGAL_ECOLI_,171202_Ecoli_ctrl2_1ug
25039,ASCTTNCLAPLAK,"57.02147@C3,57.02147@C7,",2,0.00056,102329.299228075,"ENSP00000229239,ENSP00000380065,ENSP0000038006...",171202_Ecoli_ctrl2_1ug


In [23]:
# List the distinct values of the `Filename` column.
df.loc[:, "Filename"].unique()

array(['171202_Ecoli_ctrl1_1ug', 'Filename', '171202_Ecoli_ctrl2_1ug'],
      dtype=object)

In [24]:
# Keep only the rows that belong to the ctrl2 run.
df[df["Filename"].eq("171202_Ecoli_ctrl2_1ug")]

Unnamed: 0_level_0,peptide,modifications,charge,expect,total_MS2_intensity,proteins,Filename
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25300,APLDNDIGVSEATR,,2,1e-15,204173.794466953,sp_BGAL_ECOLI_,171202_Ecoli_ctrl2_1ug
25039,ASCTTNCLAPLAK,"57.02147@C3,57.02147@C7,",2,0.00056,102329.299228075,"ENSP00000229239,ENSP00000380065,ENSP0000038006...",171202_Ecoli_ctrl2_1ug
31700,CSHYPNHPLWYTLCDR,"57.02147@C1,57.02147@C14,",3,2.4e-10,501187.233627272,sp_BGAL_ECOLI_,171202_Ecoli_ctrl2_1ug


In [2]:
import math

# Round `diff` away from zero to a whole number while keeping its sign.
# `ceil` must be qualified: only `import math` is in scope in this cell.
# NOTE(review): `diff` is not defined anywhere in this notebook — it has to be
# assigned before this cell can run.
math.copysign(math.ceil(abs(diff)), diff)

-5.0