# Notebook 1
Export the raw data into `np.memmap` arrays.

### Load important modules

In [1]:
# Standard modules
import logging
import os
import pickle
import re

import numpy as np

# Move to root directory for easier module handling.
# The root is resolved only once per kernel session: re-running this cell goes back to the
# same directory instead of climbing two more levels on every execution (which would break
# the project imports below).
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../..")
os.chdir(PROJECT_ROOT)
print(os.listdir("."))
from notebooks.data_processing.modules.maldi_conversion import process_raw_data, extract_raw_data
from notebooks.data_processing.modules.lookup_tables import process_lookup_tables
from modules.tools.misc import delete_all_files_in_folder

# multithreading/multiprocessing
from multiprocessing import Pool
from threadpoolctl import threadpool_limits

# set thread limit
# The result is bound to a name so that the cell does not echo the object's repr as output.
# NOTE(review): worker processes may inherit this limit, in which case a 16-process pool could
# run up to 16 x 16 threads -- confirm this is intended.
thread_limiter = threadpool_limits(16)


['data', 'pages', 'app.py', 'assets', 'config.py', 'index.py', 'main.py', 'TODO.py', 'notebooks', 'modules', '__pycache__']


<threadpoolctl.threadpool_limits at 0x7fab806330d0>

### Create a list of raw data filenames

In [2]:
# Load filenames
# Folder containing one sub-folder per acquisition; the path stored for each acquisition is
# <RAW_DATA_DIR><name>/<name>.
RAW_DATA_DIR = "/data/lipidatlas/data/data_raw/"

# The slice index is the run of digits right after "MouseBrainCMC_S" in the acquisition name,
# e.g. "..._S1AA1_..." -> 1, "..._S8_duplicate_..." -> 8, "..._S10(brain2_20)_..." -> 10.
SLICE_INDEX_PATTERN = re.compile(r"MouseBrainCMC_S([0-9]+)")


def parse_slice_index(name):
    """Return the slice index encoded in the acquisition name, or None if there is none."""
    found = SLICE_INDEX_PATTERN.search(name)
    return int(found.group(1)) if found else None


# Each entry is a list [slice_index, path]; lists (not tuples) because "bis" may be appended below
l_t_names = []
for raw_name in os.listdir(RAW_DATA_DIR):
    if "MouseBrain" not in raw_name:
        continue
    slice_index = parse_slice_index(raw_name)
    if slice_index is None:
        # Such a name used to crash the whole cell while the slice index was being parsed
        print("WARNING: no slice index found in " + raw_name + ", skipping it")
        continue
    l_t_names.append([slice_index, RAW_DATA_DIR + raw_name + "/" + raw_name])

# Sort by slice index first, then by path
l_t_names.sort()

# Correct for duplicates: when two consecutive entries share a slice index, tag the second one
for t_names_1, t_names_2 in zip(l_t_names[:-1], l_t_names[1:]):
    if t_names_2[0] == t_names_1[0]:
        t_names_2.append("bis")
        print("WARNING: duplicate for slice " + str(t_names_1[0]))

# Remove slices that have already been processed
path = "notebooks/data_processing/data/temp/"
os.makedirs(path, exist_ok=True)
remove_already_loaded = False
if remove_already_loaded:
    # NOTE(review): assumes the second "_"-separated field of each "raw" file name is the slice
    # index followed by a 7-character suffix -- confirm against the files actually written.
    existing_names = {
        int(name.split("_")[1][:-7]) for name in os.listdir(path) if "raw" in name
    }
    l_t_names = [x for x in l_t_names if x[0] not in existing_names]

# Print the final list of names
for t_names in l_t_names:
    print(t_names[0], t_names[1].split("/")[-1])


1 20210210_MouseBrainCMC_S1AA1_2Dpixelmode_322x231_Att25_25um
2 20210211_MouseBrainCMC_S2AB5_2Dpixelmode_370x214_Att25_25um
3 20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
4 20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
5 20210218_MouseBrainCMC_S5AE3_2Dpixelmode_396x272_Att25_25um
6 20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
7 20210220_MouseBrainCMC_S7AF5_2Dpixelmode_427x263_Att25_25um
8 20210531_MouseBrainCMC_S8_duplicate_2Dpixelmode_430x285_Att30_25um
9 20210224_MouseBrainCMC_S9AH4_2Dpixelmode_467x278_Att25_25um
10 20210210_MouseBrainCMC_S10(brain2_20)_394x282_Att30_25um
11 20210301_MouseBrainCMC_S11AK5_2Dpixelmode_448x277_Att25_25um
12 20210303_MouseBrainCMC_S12AL1_2Dpixelmode_393x266_Att25_25um
13 20210304_MouseBrainCMC_S13AM1_2Dpixelmode_413x310_Att25_25um
14 20210305_MouseBrainCMC_S14AN1_2Dpixelmode_409x285_Att25_25um
15 20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
16 20210530_MouseBrainCMC_S16_duplicate_2Dpixelmode_454


### Extract raw data into numpy arrays with multiprocessing

In [None]:
# NOTE: this extraction step is disabled (the whole cell is commented out and kept for
# reference). Uncomment it to call extract_raw_data on every entry of l_t_names.
# multiprocessing = False
# if multiprocessing:
#     with Pool(processes=14) as pool:
#         [x for x in pool.imap_unordered(extract_raw_data, l_t_names)]
# else:
#     # Normal (single-processed) map
#     [x for x in map(extract_raw_data, l_t_names)]

### Process raw data into numpy arrays with multiprocessing

In [3]:
# Set to False to process the slices one after the other in the current process
multiprocessing = True
# Upper bound on the number of worker processes
n_processes = 16

# NOTE(review): the workers print a lot of interleaved text; the saved output of this cell hit
# Jupyter's IOPub message rate limit. Consider reducing the verbosity of process_raw_data.
if not l_t_names:
    print("No slice to process")
elif multiprocessing:
    # Never start more workers than there are slices to process
    with Pool(processes=min(n_processes, len(l_t_names))) as pool:
        # The results are not used, but the iterator must be consumed inside the `with` block:
        # leaving the block terminates the pool, and worker exceptions surface here.
        for _unused_result in pool.imap_unordered(process_raw_data, l_t_names):
            pass
else:
    # Normal (single-processed) map
    for slice_names in l_t_names:
        process_raw_data(slice_names)


Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Prepare data for standardization
Standardize data
Compute and normalize pixels values according to TIC
259 260 0.00020909094018861651 0.00020909107788173194 0.7323678265553686
261 262 0.00023867991694714874 0.00023867981542903581 0.7064667092510108
260 261 0.000220037778490223 0.00022003785506540088 0.6592818897258698
246 247 0.0002658390731085092 0.00026583909108768473 0.7541586468642276
242 243 0.00024476039106957614 0.00024476022940102005 0.7619305478315267
259 260 0.0002585422189440578 0.0002585427014217025 0.7266627927932897
236 237 0.0002806723932735622 0.00028067202322002555 0.6870055776594983
257 258 0.00023221634910441935 0.00023221643880791247 0.8508497356713519
192 193 0.00015526567585766315 0.00015526564457235746 0.8090616897674328
6 7 0.0008591866935603321 0.0009368371136991118 0.

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Filtering out noise and matrix peaks
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Prepare data for standardization
Compute and normalize pixels values according to TIC
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Prepare data for standardization
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Getting spectrums array averaged accross pixels
Sorting by m/z value for averaging before standardization
Build the low-resolution averaged array from the high resolution averaged array
Getting spectrums array averaged accross pixels
Double sorting according to pixel and mz high-res array
Filtering out noise and matrix peaks
Prepare data for standardization
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Gett

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


152 153 0.0005593067035079002 0.0005593066697345533 0.6565821428987713
Sorting by m/z value for averaging
169 170 0.0005222613108344376 0.0005222610546084389 0.7155247865644582
242 243 0.00047499826177954674 0.00047499813498811825 0.7334216009787746
193 194 0.0005526317399926484 0.0005526315999058041 0.6539835462734332
193 194 0.000645816617179662 0.0006458163693645641 0.746555827398627
184 185 0.0004394371062517166 0.00043943742689775975 1.4001555340558915
135 136 0.0005773706361651421 0.0005773707863894767 1.2124723968339228
181 182 0.0005951510975137353 0.0005951510435942118 0.7987352681537834
169 170 0.0005345893441699445 0.0005345900177230406 0.7677911734042524
208 209 0.0004172151966486126 0.00041721490867238826 0.775364909467155
178 179 0.0006008964264765382 0.0006008965523233876 0.6474876498088503
179 180 0.0005092447972856462 0.0005092443534425429 1.245695989568349
151 152 0.0007271288777701557 0.0007271290479905223 0.7344295887785274
166 167 0.0006377596291713417 0.0006377594

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Prepare data for standardization
Compute and normalize pixels values according to TIC
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Standardize data
Double sorting according to pixel and mz high-res array
Compute and normalize pixels values according to TIC
Getting spectrums array averaged accross pixels
Filtering out noise and matrix peaks
Build the low-resolution averaged array from the high resolution averaged array
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Double sorting according to pixel and mz high-res array
Sorting by m/z value for averaging before standardization
Filtering out noise and matrix peaks
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um/20210214_MouseBrainCMC_S4AD3_2Dpixelmode_354x228_Att25_25um
176 17

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Getting corresponding spectra arrays
Sorting by m/z value for averaging
Saving : /data/lipidatlas/data/data_raw/20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um/20210213_MouseBrainCMC_S3AC4_2Dpixelmode_371x195_Att25_25um
Filtering out noise and matrix peaks
Prepare data for standardization
Sorting by m/z value for averaging before standardization
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Getting spectrums array averaged accross pixels
Filtering out noise and matrix peaks
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Filtering out noise and matrix peaks
Double sorting according to pixel and mz high-res array
Sorting by m/z value for averaging before standardization
Filtering out noise and matrix peaks
Getting spectrums array averaged accross pixels
Sorting by m/z value for averaging before standa

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Getting spectrums array averaged accross pixels
Filtering out noise and matrix peaks
Prepare data for standardization
Sorting by m/z value for averaging before standardization
Prepare data for standardization
Compute and normalize pixels values according to TIC
Compute and normalize pixels values according to TIC
Getting spectrums array averaged accross pixels
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Build the low-resolution averaged array from the high resolution averaged array
Getting spectrums array averaged accross pixels
Filtering out noise and matrix peaks
Double sorting according to pixel and mz high-res array
Prepare data for standardization
Sorting by m/z value for averaging before standardization
Standardize data
Filtering out noise and matrix peaks
Double sorting according to pixel and

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Sorting by m/z value for averaging before standardization
Sorting by m/z value for averaging before standardization
Standardize data
Getting spectrums array averaged accross pixels
Getting spectrums array averaged accross pixels
55 56 0.00010033092985395342 0.00010033107146530457 0.02122744405150109
95 96 0.0006638108170591295 0.0006638108420449313 3.1009277911447053
104 105 0.0008007140131667256 0.0008007140904661133 1.5644139695359796
108 109 0.0008776001632213593 0.0008776000697451794 0.2878726456942257
213 214 0.0003677276545204222 0.0003677278619414159 0.4662788584685144
223 224 Prepare data for standardization0.0003454159596003592
 0.00034541611019530045 0.010289039833458625
239 240 0.00030341953970491886 0.00030341956745708 0.4997321929407901
232 233 0.00035648071207106113 0.0003564809215437263 0.03271176080435093
209 210 0.0003423012385610491 0.0003423012586249426 0.7444435309084877
234 235 0.0003599407500587404 0.00035994080951346174 0.652843

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


 

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


0.0004103313840460032 0.00041033141317781737 0.9691046359075782
246 247 0.00035458794445730746 0.0003545880569998804 1.0021185945080535
288 289 0.0002196691930294037 0.00021966899547037525 0.8241220990130073
190 191 0.0004106445412617177 0.0004106444876488605 1.073523567824804
233 234 0.00026778681785799563 0.0002677868577960893 1.0379824770665633
207 208 0.00023266073549166322 0.00023266077737450514 1.0445709631603664
221 222 0.00039218307938426733 0.00039218277163556624 0.9985514245759726
197 198 0.0004047604452352971 0.0004047606302225213 0.9400073183728765
254 255 0.000234968974837102 0.00023496894236241434 0.9280385856271883
205 206 0.00032013317104429007 0.0003201329972642738 1.0784898340789906
Sorting by m/z value for averaging
198 199 0.00033774878829717636 0.000337748448995974 1.0446087707325145
215 216 0.0002821027592290193 0.00028210267422072927 1.048334170193533
181 182 0.00048261028132401407 0.00048260956331763185 0.9418334407466535
181 182 0.0004329873190727085 0.00043298

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 0.94878616415976140.0006423412705771625
 0.0006423414792736688 1.1063771579649706
111220  112221  0.00139172701165080070.0002973801747430116  0.00139172704768044350.00029738031242446106  1.02542086660621720.7314438398359694

199 200 0.00040862601599656045 0.0004086260500701589 1.0380525093404456
160 161 0.0003915557754226029 0.0003915556815478039 0.667963231674966
7 868  0.001176753314211964669  0.0011767537811842890.0022460336331278086  1.18915598906792220.0022460352215562087
 0.6794436011457252
180 181 0.0004214416549075395 0.0004214411909981343 1.0216563166299797
204 205 2630.0003246983396820724  2640.0003246984324935083  0.000298152299365028740.885233965697281 
0.0002981524738619075 0.7308983365114816
256 257 2140.0002508390462026  0.0002508392484699097215  0.99477233626538560.0003973481943830848
 0.0003973481128111409 1.0756065797588477
221 222 0.0004468703700695187 0.0004468695252590254 1850.7318163082003672 
186 0.0004292568482924253210  0.0004292570104771236211  1.055492406919

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


258 259 0.00023586837050970644 0.00023586874562090258 0.9837464342564087
3 4 0.0009611791465431452 0.0009611797446359484 0.41164973004188476
179 180 0.0006076986901462078 0.0006076987962007547 0.799447308591496
165 166 0.0007506399415433407 0.000750640111391854 0.729928188006479
177 178 0.000622342515271157 0.0006223419653407848 0.8206890669013146
229 230 0.000354878167854622 0.0003548778475548883 0.9455984042673155
187 188 0.00025509099941700697 0.00025509078606974014 0.9698864556878262
177 178 0.0007071697618812323 0.0007071694712364408 0.7618436912610037
200 201 0.0004824633651878685 0.0004824630940525279 0.6912317223868883
194 195 0.0006053686956875026 0.0006053686715540924 0.8040775834590926
206 207 0.00013159347872715443 0.00013159331581191573 0.8386037431017352
191 192 0.00054579769494012 0.0005457978423852089 0.6860732304097813
216 217 0.0003176663303747773 Sorting by m/z value for averaging0.0003176663027520966
 0.9171183109523724
296 297 0.00017796897736843675 0.0001779689411

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



240
 241 0.00047557917423546314 0.00047557942604168637 0.7787192603337809
274 275 0.00021700609067920595 0.00021700575307408276223  1.1302587184145965224
 0.0004584902198985219 0.00045848978723946857 0.7719947130827968
159 160 0.0006151096895337105 0.0006151097507225559 0.7816698247760304
363 364 0.00018003513105213642 0.00018003503080778454 0.7465292371757981
206 207 0.00024284509709104896 0.00024284521742750734239  0.8967234789061581240
 0.0004581478424370289 0.0004581479772215079 0.755101119055968
254 255 0.0002606834168545902 0.00026068306333233053 1.0586598390936979
204 205 0.0005596898845396936 0.000559690032337781 0.8249036259441755
211 212 0.00041601157863624394 0.0004160118850696717 0.7230952545356463
208188  209189  0.000360521808033809070.0005190300289541483  0.00036052176611213880.0005190301836733802  0.84985293565617090.627200043782189

190205  191206  0.00040521335904486480.0003551880654413253  0.000405213296250691270.00035518799078120347  0.56672551403161311.057308203450

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



0.00046681522508151830.0005552160437218845  0.0004668154606300342640.0005552164567132253   650.5653319471018350.7526543111057155
 
0.0005117977852933109 0.0005117978280013416218  0.7343206842011465219
 0.00044183986028656363 0.00044183938498305517 0.8301331449286111Sorting by m/z value for averaging

228 205229  2060.0003135676961392164  0.00128583912737667560.0003135674190841084  0.00128584050087606530.9390902482470932 
0.9452042565897415223249
  224250211   0.0003448194474913180.00047757482388988137212   0.000344818869024863670.000477574900384158760.0013669554609805346   1.02203764952220260.85613570461613762210.001366955434569999

  2220.9162281433118453247 
 0.00033902114955708385248 233 0.00033902087035093904 0.00038766441866755486 234 1.1562599635769828 0.00038766470701605743
0.0014530977932736278  0.96108047058088670.0014530980605734407
 0.8568197522595511264
 265287270   0.0003039471048396081288271   0.00030394722024534330.000282227032585069540.0003329276805743575   1871.0856466

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)



 215251

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


0.9639412879607635  
0.0004802370967809111252 
0.00048023739520349705251240   1.090766210966109252241
  0.0008220561430789530.0002944648440461606 187 170 0.00099588458437747180.00029446506689993253 188  171 0.99034463166944661.1007881704749136 0.0003620864008553326

 0.000331288552843034270.00036208652220352463 189 0.0003312883253500893 0.8422008369936197 190
1920.8400116032802635  
0.0004174595815129578193252   0.00041745948953456993020.0018404091242700815253    0.80057532863323121763030.0018404108749662910.00024191207194235176
    1770.000192157662240788340.94204628074373360.00024191197683457995   
0.0004975781776010990.000192157432336648070.9923492266124224  
0.00049757781170109231.1487529623307096 
0.49507032252389005
202 203195  0.0003570905246306211196  0.00035709021369277810.0005148156778886914 217 0.8285994817897366 0.000514816033654089199
218  128028 0.7887499934987595200 0.00038225355092436075
 have been transformed, with an average of 188 0.00062234589131549  0.0003822536581

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


    200 176212192

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


    2531640.00061757344519719480.000419657415477558970.000734151981305331 0.00048110962961800396     0.00061757301697043280.000481109747541122070.00041965770650118210.00053954083705320950.00073415220992494270.0004963134415447712254       0.62057571633524630.00034198016510345040.00049631290124589081.0870996511300330.0005395407938524321Prepare data for standardization


 201 0.00036205272772349417 0.0003620530151086415 0.8703056912129763
188 189 0.00047830084804445505 0.00047830077412249335 1.0930879164128275

209 214210  2150.0002996386610902846  0.00029963859971369540.0004377435252536088  0.000841076485812664 0.0008410763584526199 0.4471101382572675
1.1226807594008092
234 235 0.00026522469124756753 1590.00026522489859495164  1601.1858199849863673 
0.0007890469278208911234 0.0007890468223060497  2350.5693528851246075 
40.00041962010436691344  50.0004196198174904992Getting spectrums array averaged accross pixels  0.0008197724819183352151.0421155806438838
 
 0.0008197713637571665 2160.692

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



0.0002422686337476935 1.039840663773605
218 219 0.00017986141028814018 0.00017986124258119625 0.7967548820054916
229 230 0.0004037269973196089 0.00040372706198531943 0.749231362791428
258 259
 0.0002955770178232342 0.00029557699065640985 0.9184040802095402
266 267 0.0001877570030046627 0.00018775688581343212 1.0189270922374105
 0.895817262049132
116 117 0.0003396538377273828 0.0003396536570961407 0.734640844453116
46 5  70.0008921713451854885  0.00061597995227202770.0008921713165783935  0.00061597998481444350.8551734878228102 
0.8396793560662436
6 7 0.0010494963498786092 1990.0010494974905757397   2000.8319865381490288 
0.000384619176594816950.0004144123522564769  1570.46474593136429590.0004144121886030137127
   0.88462697782865480.0008381515508517623128
  0.00083815109099972140.00023389601847156882  1840.496034191581360640.00023389602564035267 
 2072271850.7567896821120756   
2080.0005865737912245095228  0.0006141888443380594 0.0005865732511890654154 0.0004987511201761663  0.000614188

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


2661981520.00040888687362894416   

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


   0.000420434080297127370.00037817070888868780.00026696099666878580.0003605216334108263199 0.00040888705358081776    0.00042043407922796687 0.93651320151354670.00026696102784306120.00036052177434546560.0004783525946550071 0.7960514037550485
  0.8172442138847383 
0.92615203140620380.9212151286598697
0.00047835207662095744

 1.0129158004316117
211286  2122872   0.00034804333699867130.00041168119059875613  220 0.000348043373881344860.0004116809259551435 0.0007760405424050987  221 1.16558862108336720.7369307792052997 0.0007760402744519084

0.00023764323850627989  2090.82961955105186470.0002376432846590309 
189 210 1.0929276292121615 1900.0004219875845592469
  0.00060869654407724742400.0004219876801347723  233 0.0006086968417461028241 1.0969335729665186  234
0.86490084894543320.000285182410152629 
 0.000369953893823549152100.000285182422181087 209  0.000369953845591864 2110.9462572584194808 210 
0.8593461568204398 0.0004483298980630934
0.0002578617131803185  0.000448329901045818760.0002578

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


0.875887320214127  


  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


0.00024723303752727320.0003430313263298898  1.1514783520960431.1555874527668206

180 181 0.0005471925833262503 2110.0005471922869752997  2120.7813728067184805256 
 0.00034838152350857854178257   0.000348381782245696531790.0003159349726047367254    0.94817839956866570.00049739913083612920.00031593530581714054255
   0.00049739909477265970.9341420704802190.0002488059108145535 
 0.77945969293825390.0002488060195141757
 2430.8626528938322634 
244208  0.00036579061998054385209 157 0.00036579064340980233 0.00042796722846105695 158 0.9220944951931301 0.0004279668940201734
0.0006821949500590563 244 0.9857722374711677 0.0006821945629206532
245 243 0.7895065025486304 
0.0003646989935077727195244   0.000364698804828196271960.00027879909612238407   0.000288934126729145650.81009837871984610.00027879914235180313 
 0.000288934113950671130.6942339362619872 
0.9157810050886908189
 190208 191 0.00031970307463780046 209 192 0.00031970316539887766 0.00029075302882120013 0.00044666259782388806 212 0.8519319

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 0.00032763410126790404 0.0003276342976563963 1.6610982514469121
201 202 0.00029500186792574823 0.0002950021587876145 1.492197016491507
202 203 0.00032128815655596554 0.0003212880675039565 1.2289640076609396
174 175 0.0005473491037264466 0.0005473489564054693 1.5705908488708216
189 190 0.0005845004343427718 0.0005845002369391202 1.5668630256747875
169 170 0.00042977198609150946 0.00042977222969398023 1.290615001043215
137 138 0.0001330486556980759 0.00013304856756098652 3.896520126688799
236 237 0.00027008671895600855 0.00027008660803083174 2.293282546573537
426 427 0.00021298901992850006 0.00021298912457315031 1.919806068311168
253 254 7.333454414037988e-05 0.00016975835718186308 1.9125867656021818
383 384 0.0002651048998814076 0.00026510503372093475 1.8067743375048784
519 520 0.0002356920304009691 0.0002356919110266708 1.907032899986065
148 149 0.00020985412993468344 0.0002098540837313921 1.0229130079565547
480 481 0.0003151045530103147 0.00031510433359539715 1.662582768306008
368 36

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um/20210219_MouseBrainCMC_S6AE3_2Dpixelmode_423x282_Att25_25um
Getting spectrums array averaged accross pixels
Getting spectrums array averaged accross pixels
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Getting spectrums array averaged accross pixels
Standardize data
Getting spectrums array averaged accross pixels
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res arr

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Double sorting according to pixel and mz high-res array
Build the low-resolution averaged array from the high resolution averaged array
Getting corresponding spectra arrays
Getting corresponding spectra arrays
Double sorting according to pixel and mz high-res array
Saving : /data/lipidatlas/data/data_raw/20210408_MouseBrainCMC_S21AU4_2Dpixelmode_394x215_Att30_25um/20210408_MouseBrainCMC_S21AU4_2Dpixelmode_394x215_Att30_25um
Saving : /data/lipidatlas/data/data_raw/20210409_MouseBrainCMC_S22AV1_2Dpixelmode_416x207_Att30_25um/20210409_MouseBrainCMC_S22AV1_2Dpixelmode_416x207_Att30_25um
Getting corresponding spectra arrays
Double sorting according to pixel and mz high-res array
Saving : /data/lipidatlas/data/data_raw/20210323_MouseBrainCMC_S18AR4_2Dpixelmode_474x291_Att30_25um/20210323_MouseBrainCMC_S18AR4_2Dpixelmode_474x291_Att30_25um
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210319_MouseBrainCMC_S17AQ2_2Dpixelmode_4

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Getting corresponding spectra arrays
230 231 0.0003836624382529408 0.0003836624658118425 0.9330221692464298
213 214 0.00036428411840461195 0.0003642840869502373 1.0176155757261076
210 211 0.00030366191640496254 0.0003036618805529437 1.008806139004369
156 157 0.0005187556962482631 0.0005187557806047324 0.9147967579109654
183 204184  2050.00013957709597889334  0.00065020378679037090.00013957703292645363  0.0006502035521499181.3359379211012894 
0.600553640974517
216 217 0.00036236748564988375 0.0003623675206767509 0.8976826646080124
203 204 0.0003160439373459667 0.00031604361673546913 1.0770416790870256
200 201 1910.00029347746749408543  1920.00029347746180306665  0.0003990472177974881.2223409824601001 0.000399046888812669
 0.7502607730142122
134 135 0.0005273866700008512 0.0005273868510194278 0.9092368533339573
166 167 0.0004449370317161083 0.00044493696351950745 1.013304687622646
174 175 0.0002774223103187978161  0.0002774223802400365162  1.12863613884

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


107253 have been transformed, with an average of  0.0003263311981949223 peaks transformed
Sorting by m/z value for averaging
Sorting by m/z value for averaging
Compute and normalize pixels values according to TIC
Filtering out noise and matrix peaks
Getting spectrums array averaged accross pixels
Getting corresponding spectra arrays
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Getting spectrums array averaged accross pixels
Saving : /data/lipidatlas/data/data_raw/20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um/20210313_MouseBrainCMC_S15AO2_2Dpixelmode_451x292_Att25_25um
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Sorting by m/z value for averaging before standardization
Build the low-resolution averaged array from the high resolution averaged arr

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
222 223 0.00017803760420065373 0.00017803787504757273 0.6570495145304921
201 202 0.00015653074660804123 0.00015653066087734785 0.5812638268333137
299 300 0.00021153924171812832 0.00021153929939623332 1.454887043741065
244 245 0.0003211548610124737 0.0003211549659883219 1.4834419699791488
254 255 0.0001265893515665084 0.00012658946546251768 0.5314215037462263
207 208 0.00012945651542395353 0.0001294565849216277 0.5334473747395513
283 284 0.00015992512635421008 0.00015992508208867798 1.5972216851163665
266 267 0.0002749153063632548 0.00027491539327193534 1.3790714775262973
202 203 0.0001698770938673988 0.00016987710360018103 0.7100950713409826
269 270 0.00023707006766926497 0.0002370701535647207 1.5367374218840684
209 210 0.00018841464770957828 0.00018841467180298648 0.6222342311671614
308 309 0.0001539144868729636 0.00015391447453576003 1.6165215535747022
191 192 0.00013588838919531554 0.00013588834733316692 0.541018757511502
286 287 0.0002644154883455

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Standardize data
Getting spectrums array averaged accross pixels
Filtering out noise and matrix peaks
Prepare data for standardization
554 555 0.00026654399698600173 0.0002665438909937314 0.12185995299169657
360 361 0.000260584318311885 0.0002605844336916013 0.12464694404234092
224 225 0.000259850436123088 0.0002598504265434911 0.8868116580441744
215 216 0.00025523125077597797 0.00025523161741433604 1.2937031104559242
219 220 0.00023299372696783394 0.00023299314348318537 0.0435488834477001
253 254 0.0001775517303030938 0.00017755180151896401 0.08432172835891893
247 248 0.00025848549557849765 0.0002584858096452776 1.0162464783515026
283 284 0.00018366494623478502 0.00018366499732818238 0.04324427345739868
291 292 0.00024192375713028014 0.00024192409235982473 0.03298931017462147
279 280 0.0002600692387204617 0.00026006928060431196 1.1228529469463158
128 129 0.0002742937649600208 0.0002742939866576702 2.3646399202432935
204 205 0.0002893288619816303 0.0002893287489502647 0.915442543322458

  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)
  array_corrective_factors = np.nan_to_num(arrays_after_transfo / arrays_before_transfo)


Sorting by m/z value for averaging
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210423_MouseBrainCMC_S28_3_2Dpixelmode_390x244_Att30_25um/20210423_MouseBrainCMC_S28_3_2Dpixelmode_390x244_Att30_25um
Getting spectrums array averaged accross pixels
Sorting by m/z value for averaging before standardization
Getting spectrums array averaged accross pixels
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Build the low-resolution averaged array from the high resolution averaged array
Double sorting according to pixel and mz high-res array
Getting corresponding spectra arrays
Saving : /data/lipidatlas/data/data_raw/20210504_MouseBrainCMC_S32_3_2Dpixelmode_298x230_Att30_25um/20210504_MouseBrainCM

### Build lookup tables

In [4]:
# Toggle between a 16-worker process pool and a plain in-process run; either way
# process_lookup_tables is called once per entry of l_t_names and its return
# values are discarded.
multiprocessing = True
if not multiprocessing:
    # Serial path: `map` is lazy, so materialise it to actually run the calls
    list(map(process_lookup_tables, l_t_names))
else:
    # Parallel path: pool.map blocks until every slice has been processed
    with Pool(processes=16) as pool:
        pool.map(process_lookup_tables, l_t_names)



Size (in mb) of lookup_table_spectra_high_res:  551.95
Shape of lookup_table_spectra_high_res:  (2000, 72345)
Size (in mb) of lookup_table_spectra_high_res:  604.1
Shape of lookup_table_spectra_high_res:  (2000, 79180)
Size (in mb) of lookup_table_spectra_high_res:  615.78
Shape of lookup_table_spectra_high_res:  (2000, 80712)
Size (in mb) of lookup_table_spectra_high_res:  821.78
Shape of lookup_table_spectra_high_res:  (2000, 107712)
Size (in mb) of lookup_table_spectra_high_res:  567.49
Shape of lookup_table_spectra_high_res:  (2000, 74382)
Size (in mb) of lookup_table_spectra_high_res:  910.08
Shape of lookup_table_spectra_high_res:  (2000, 119286)
Size (in mb) of lookup_table_spectra_high_res:  797.56
Shape of lookup_table_spectra_high_res:  (2000, 104538)
Size (in mb) of lookup_table_spectra_high_res:  889.32
Shape of lookup_table_spectra_high_res:  (2000, 116565)
Size (in mb) of lookup_table_spectra_high_res:  847.69
Shape of lookup_table_spectra_high_res:  (2000, 111108)
Size (

### Record everything and clean 

Record everything in memmap files and a pickled dictionary

In [5]:
output_folder = "data/whole_dataset/"
input_folder = "notebooks/data_processing/data/temp/"
os.makedirs(output_folder, exist_ok=True)


def _write_memmap(path, array, dtype):
    """Dump `array` into a newly created np.memmap file and return its shape.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten (mode "w+").
    array : np.ndarray
        Data to store. Values are cast to `dtype` on assignment.
    dtype : str
        On-disk dtype of the memmap.

    Returns
    -------
    tuple
        `array.shape`, recorded by the caller next to the memmap file.
    """
    fp = np.memmap(path, dtype=dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]
    fp.flush()
    # Drop the reference immediately instead of waiting for the next rebinding
    del fp
    return array.shape


dic_slices = {}
# Loop over slice files
for slice_name in os.listdir(input_folder):
    print(slice_name)
    # Skip entries whose name contains "raw" or "checkpoints", and anything that is
    # not an .npz archive (it could neither be parsed for an index nor loaded below)
    if (
        "raw" in slice_name
        or "checkpoints" in slice_name
        or not slice_name.endswith(".npz")
    ):
        continue
    # Extract slice index: "slice_<index>.npz" -> <index>
    slice_index = int(slice_name.split("_")[1][:-4])

    # Load slice arrays; the context manager closes the underlying archive file once
    # every array has been read into memory
    with np.load(os.path.join(input_folder, slice_name)) as npzfile:
        array_pixel_indexes_high_res = npzfile["array_pixel_indexes_high_res"]
        array_spectra_high_res = npzfile["array_spectra_high_res"]
        array_averaged_mz_intensity_low_res = npzfile["array_averaged_mz_intensity_low_res"]
        array_averaged_mz_intensity_high_res = npzfile["array_averaged_mz_intensity_high_res"]
        array_averaged_mz_intensity_high_res_before_standardization = npzfile[
            "array_averaged_mz_intensity_high_res_before_standardization"
        ]
        image_shape = npzfile["image_shape"]
        divider_lookup = npzfile["divider_lookup"]
        lookup_table_spectra_high_res = npzfile["lookup_table_spectra_high_res"]
        cumulated_image_lookup_table_high_res = npzfile["cumulated_image_lookup_table_high_res"]
        lookup_table_averaged_spectrum_high_res = npzfile[
            "lookup_table_averaged_spectrum_high_res"
        ]
        array_peaks_corrected = npzfile["array_peaks_corrected"]
        array_corrective_factors = npzfile["array_corrective_factors"]

    # Print the size (in MiB) used by each array, one value per line, in this order
    for array in (
        array_pixel_indexes_high_res,
        array_spectra_high_res,
        array_averaged_mz_intensity_low_res,
        array_averaged_mz_intensity_high_res,
        lookup_table_spectra_high_res,
        cumulated_image_lookup_table_high_res,
        lookup_table_averaged_spectrum_high_res,
    ):
        print(round(array.nbytes / 1024 / 1024, 2))

    # Register the lightweight arrays in the dictionary that gets pickled at the end
    dic_slices[slice_index] = {
        "image_shape": image_shape,
        "divider_lookup": divider_lookup,
        "array_avg_spectrum_downsampled": array_averaged_mz_intensity_low_res,
        "array_lookup_pixels": array_pixel_indexes_high_res,
        "array_lookup_mz_avg": lookup_table_averaged_spectrum_high_res,
        "array_peaks_transformed_lipids": array_peaks_corrected,
        "array_corrective_factors": array_corrective_factors,
    }

    # Build a memmap for each of the heavier arrays to save RAM, and save the
    # corresponding shape in the pickled dictionary.
    # Each entry: (file name prefix, key of the shape in dic_slices, array, on-disk dtype)
    # NOTE(review): the "before_standardization" prefix has no trailing underscore,
    # unlike its siblings; kept as is so the produced file names do not change --
    # confirm against the code that reads these files.
    memmap_specs = [
        ("array_spectra_", "array_spectra_shape", array_spectra_high_res, "float32"),
        (
            "array_avg_spectrum_",
            "array_avg_spectrum_shape",
            array_averaged_mz_intensity_high_res,
            "float32",
        ),
        (
            "array_avg_spectrum_before_standardization",
            "array_avg_spectrum_before_standardization_shape",
            array_averaged_mz_intensity_high_res_before_standardization,
            "float32",
        ),
        ("array_lookup_mz_", "array_lookup_mz_shape", lookup_table_spectra_high_res, "int32"),
        (
            "array_cumulated_lookup_mz_image_",
            "array_cumulated_lookup_mz_image_shape",
            cumulated_image_lookup_table_high_res,
            "float32",
        ),
    ]
    for prefix, shape_key, array, dtype in memmap_specs:
        dic_slices[slice_index][shape_key] = _write_memmap(
            os.path.join(output_folder, prefix + str(slice_index) + ".mmap"), array, dtype
        )

# Pickle the dict of lightweight data
with open(os.path.join(output_folder, "light_arrays.pickle"), "wb") as handle:
    pickle.dump(dic_slices, handle)


slice_14raw.npz
slice_8raw.npz
slice_26raw.npz
slice_13raw.npz
slice_15raw.npz
slice_16raw.npz
slice_21raw.npz
slice_22raw.npz
slice_23raw.npz
slice_12raw.npz
slice_30raw.npz
slice_6raw.npz
slice_9raw.npz
slice_31raw.npz
slice_1.npz
0.57
148.1
0.05
0.98
567.49
567.49
0.01
slice_2.npz
0.6
158.53
0.06
1.1
604.1
604.1
0.01
slice_3.npz
0.55
71.81
0.05
0.94
551.95
551.95
0.01
slice_4.npz
0.62
61.66
0.05
1.32
615.78
615.78
0.01
slice_5.npz
0.82
136.45
0.05
1.13
821.78
821.78
0.01
slice_10.npz
0.85
273.05
0.06
1.35
847.69
847.69
0.01
slice_7.npz
0.86
218.57
0.06
1.24
856.79
856.79
0.01
slice_12.npz
0.8
226.24
0.06
1.21
797.56
797.56
0.01
slice_6.npz
0.91
210.06
0.06
1.32
910.08
910.08
0.01
slice_11.npz
0.95
251.87
0.06
1.25
946.78
946.78
0.01
slice_8.npz
0.93
162.56
0.06
1.2
934.98
934.98
0.01
slice_9.npz
0.99
270.59
0.06
1.36
990.49
990.49
0.01
slice_14.npz
0.89
223.25
0.06
1.24
889.32
889.32
0.01
slice_13.npz
0.98
232.6
0.08
1.8
976.79
976.79
0.01
slice_19.npz
0.7
144.88
0.05
1.12
700.93
70

Clean temporary folder

In [None]:
# Cleaning is opt-in: set `clean` to True to call delete_all_files_in_folder on
# `input_folder` (the temporary folder the slices were read from); presumably this
# removes the intermediate files -- verify the helper before enabling.
clean = False
if clean:
    delete_all_files_in_folder(input_folder)
