## Preprocessing of feature table

**Author**: Madeleine Ernst (mernst@ucsd.edu) <br>
**Edited by**: - <br>
**Use case**: Remove selected features (e.g. impurities, lock mass) from MZmine preprocessed feature table, add metadata and put into appropriate format for PCoA analysis in http://dorresteinappshub.ucsd.edu:3838/clusterMetaboApp0.9.1/. <br>
**Input file format**: <br>
<ul>
<li>**Feature table** (.csv) with features in rows and samples in columns retrieved from MZmine. </li>
<li>**List of features to remove** (tab-separated .txt): table containing the m/z values of the features to be removed (see template attached to notebook). The column containing these m/z values should be named "MonoisotopicMass_Ion".</li>
<li>**Metadata table** (tab-separated .txt) with samples in rows and metadata categories in columns. The column containing the sample names should be named "filename".</li>
</ul>
**Outputs**: .csv table with samples in rows and metadata as well as features in columns. This table can be imported to http://dorresteinappshub.ucsd.edu:3838/clusterMetaboApp0.9.1/ for PCoA analysis. <br>
**Dependencies**: R version 3.3.3 (2017-03-06) and libraries plyr_1.8.4, gplots and RColorBrewer

load libraries

In [1]:
# Personal library directory from which gplots and RColorBrewer are loaded.
mylib <- '~/Rlib'

# Create the library directory if it does not exist yet.
# NOTE(review): no install step is visible in this notebook, so a freshly
# created (empty) directory would make the two library() calls below fail --
# confirm the packages are installed into mylib beforehand.
if (!dir.exists(mylib)){
    dir.create(mylib)
}
library(gplots, lib.loc = mylib)
library(RColorBrewer, lib.loc = mylib)
# plyr is loaded from the default library paths; its startup messages are hidden.
suppressMessages(library(plyr))


Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess



read feature table

In [2]:
# Read the MZmine feature table (features in rows, samples in columns).
mat_MS1dat <- read.csv("MSCourse_Group4_Yogurt_FeatureFinding_Filtered.csv", header = TRUE)
# Strip the peak-area suffix from the sample column names. The suffix is matched
# literally (fixed = TRUE) so its dots are not interpreted as regex wildcards
# that could match arbitrary characters inside a sample name.
colnames(mat_MS1dat) <- gsub(".filtered.Peak.area", "", colnames(mat_MS1dat), fixed = TRUE)
head(mat_MS1dat)

row.ID,row.m.z,row.retention.time,G95973_repeat_RG9_01_31632.mzXML,G95964_RA4_01_31456.mzXML,G95965_RH3_01_31592.mzXML,G95984_repeat_RG10_01_31635.mzXML,G95960_repeat_RB10_01_31547.mzXML,G95952_BA6_01_31659.mzXML,G95955_RE10_01_31522.mzXML,⋯,G96202_repeat_RD6_01_31617.mzXML,G96194_RD5_01_31500.mzXML,G96196_RC6_01_31488.mzXML,G96191_RA8_01_31461.mzXML,G96177_repeat_RG12_01_31637.mzXML,G96178_repeat2_RH10_01_31679.mzXML,G96185_repeat_RD3_01_31615.mzXML,G96193_repeat_RD8_01_31648.mzXML,G96192_RH7_01_31602.mzXML,X
1,219.0184,0.2193807,1610375.4,2222779.8,2462434.3,13013.14,4242.123,15796.87,3535.226,⋯,120711.8,15059.606,13548.987,22152.4,236221.2,230796.0,7392.643,9123.73,2321.386,
2,203.0525,0.2354085,1450424.6,1642752.8,2147674.5,30427.82,18359.641,8924.331,24435.71,⋯,210498.4,359178.8025,228354.882,275761.14,177629.8,172605.8,306544.222,197865.61,423937.476,
3,254.1611,0.3855692,613483.0,662945.1,757138.9,1368442.98,1291443.233,1280366.644,899953.138,⋯,1387922.8,471375.179,1001945.634,253830.14,1723692.2,1357316.9,466612.776,360977.41,270817.996,
4,311.0811,4.3231482,516360.2,444867.3,549168.7,526619.1,696526.75,367393.904,510855.601,⋯,506525.1,322042.1635,454425.889,285665.1,447963.6,381666.1,468795.122,241655.89,235303.84,
5,383.1157,0.3488201,616527.7,710269.3,869053.0,0.0,0.0,0.0,0.0,⋯,0.0,770.7915,2252.492,42883.73,0.0,0.0,4898.056,0.0,40525.056,
6,163.0601,0.2869653,588713.5,809426.7,669007.6,942627.03,776336.2,856284.362,1190917.058,⋯,1214042.1,758127.5915,799216.653,389281.77,643398.1,307122.4,980278.561,382153.59,378584.398,


read metadata table

In [3]:
# Load the tab-delimited sample metadata (header row expected) and preview it.
metadata <- read.delim("MScourse2018_Global_metadata_FINAL_JMG_6.11.18.txt")
head(metadata)

additives,additives_specific,ATTRIBUTE_age,age_units,animal_product_substitute,ATTRIBUTE_animal_source,animal_source_specific,barcode_number,best_by,botanical_anatomy,⋯,title,touch_animal_product,type_of_cuisine_ethnicity,upc,vegan,ATTRIBUTE_Vendor_store,washed_not,washed_specific,ATTRIBUTE_Group_num,filename_image
no,not applicable,not applicable,not applicable,no,turkey,meat,96149,4/25/18,not applicable,⋯,Global FoodOmics,yes,not applicable,71627077805.0,no,Trader Joe's,not applicable,not applicable,1,not collected
no,not applicable,not applicable,not applicable,no,turkey,meat,96150,4/25/18,not applicable,⋯,Global FoodOmics,yes,not applicable,71627077805.0,no,Trader Joe's,not applicable,not applicable,1,not collected
no,not applicable,not applicable,not applicable,no,turkey,meat,96151,4/26/18,not applicable,⋯,Global FoodOmics,yes,not applicable,71627077805.0,no,Trader Joe's,not applicable,not applicable,1,not collected
no,not applicable,not applicable,not applicable,no,cow,meat,96158,4/21/18,not applicable,⋯,Global FoodOmics,yes,not applicable,211653000000.0,no,Ralph's,not applicable,not applicable,1,not collected
no,not applicable,not applicable,not applicable,no,cow,meat,96159,4/21/18,not applicable,⋯,Global FoodOmics,yes,not applicable,211653000000.0,no,Ralph's,not applicable,not applicable,1,not collected
no,not applicable,not applicable,not applicable,no,cow,meat,96160,4/21/18,not applicable,⋯,Global FoodOmics,yes,not applicable,211653000000.0,no,Ralph's,not applicable,not applicable,1,not collected


read features to remove

In [5]:
# Load the tab-delimited list of ions to remove (header row expected) and preview it.
rem_tab <- read.delim("list_ions_remove.txt")
head(rem_tab)

Compound_Name,Formula_Neutral,MonoisotopicMass_Neutral,Species,MonoisotopicMass_Ion,X,X.1
622 lock mass,C12H18F12N3O6P3,621.0217,[M+H]+,622.029,,
622 lock mass,C12H18F12N3O6P3,621.0217,[M+Na]+,644.0109,,
922 lock mass,C18H18F24N3O6P3,921.0025,[M+H]+,922.0098,,
922 lock mass,C18H18F24N3O6P3,921.0025,[M+Na]+,943.9917,,
sulfadimethoxine,C12H14N4O4S,310.0736,[M+H]+,311.0809,,
sulfamethizole,C9H10N4O2S2,270.0245,[M+H]+,271.0318,,


In [6]:
# Lower and upper bound of a ppm error window around x.
#   x      : numeric value(s), e.g. m/z
#   ppmerr : tolerance in parts per million
# Returns c(lower, upper). For a vector x all lower bounds come first,
# followed by all upper bounds.
ppm <- function(x,ppmerr){
    half_width <- x*ppmerr/1e6
    c(x-half_width, x+half_width)
}

# Test whether x lies strictly inside an open interval.
#   x        : value(s) to test
#   interval : two elements, lower bound first, upper bound second
# Stops if interval does not have exactly two elements. Values equal to
# either bound are reported as outside the interval.
in_interval <- function(x, interval){
    stopifnot(length(interval) == 2L)
    above_lower <- interval[1] < x
    below_upper <- x < interval[2]
    above_lower & below_upper
}

In [7]:
# Extract the ion m/z values of the features to remove. Fail early with a
# clear message if the expected column is absent, instead of silently
# producing NULL and erroring later in an unrelated place.
if (!"MonoisotopicMass_Ion" %in% colnames(rem_tab)) {
    stop("list of features to remove must contain a column named 'MonoisotopicMass_Ion'",
         call. = FALSE)
}
rem <- rem_tab$MonoisotopicMass_Ion

specify your ppm error tolerance (here 10 ppm)

In [8]:
# ppm tolerance handed to ppm() when the removal windows are built
tol <- 10

create candidate list with m/z ranges to be removed

In [9]:
# One row per feature to remove: V1 = lower bound, V2 = upper bound of its
# ppm window. vapply() fixes the result shape (two numbers per m/z value), so
# an empty removal list yields an empty two-column table rather than a list
# that t() cannot handle.
cand <- as.data.frame(t(vapply(rem, ppm, numeric(2), tol)))
cand

V1,V2
622.0228,622.0352
644.0045,644.0173
922.0006,922.019
943.9823,944.0011
311.0778,311.084
271.0291,271.0345
279.0882,279.0938
285.0179,285.0237
287.0352,287.041
278.1875,278.1931


In [10]:
# Collect the row indices of all features whose m/z lies inside any removal
# window. seq_len() makes the iteration a no-op when cand has no rows
# (1:nrow(cand) would iterate over c(1, 0)), and in_interval() is applied to
# the whole m/z column at once instead of once per feature.
ids <- unique(unlist(lapply(seq_len(nrow(cand)), function(i) {
    window <- unlist(cand[i, ], use.names = FALSE)
    which(in_interval(mat_MS1dat$row.m.z, window))
})))

show m/z values that will be removed

In [11]:
# m/z values of the rows currently selected for removal
mat_MS1dat$row.m.z[ids]

put caffeine peak back in (for coffee and tea samples)

In [12]:
# Keep row 1 in the table by dropping it from the removal list.
# Logical subsetting is used on purpose: ids[-which(ids == 1)] empties the
# whole vector whenever 1 is not in ids, because which() then returns
# integer(0) and x[-integer(0)] selects nothing.
# NOTE(review): 1 is a row index of mat_MS1dat -- confirm that row 1 is the
# feature that is meant to be kept.
ids <- ids[ids != 1]

remove selected m/z values from feature table

In [13]:
# Drop the selected rows. The guard matters: with an empty ids,
# mat_MS1dat[-ids, ] would select zero rows instead of leaving the table intact.
if (length(ids) > 0) {
    mat_MS1dat <- mat_MS1dat[-ids, ]
}

transpose feature table and combine with metadata

In [14]:
# t() turns the data frame into a matrix with the original columns (row.ID,
# row.m.z, row.retention.time, then the samples) as rows and features as columns
mat_MS1dat <- t(mat_MS1dat)

In [15]:
# Name each feature column "<row ID>_<m/z, 4 decimals>_<retention time, 2 decimals>",
# taken from the first three rows of the transposed table.
colnames(mat_MS1dat) <- paste(
    mat_MS1dat[1, ],
    round(mat_MS1dat[2, ], 4),
    round(mat_MS1dat[3, ], 2),
    sep = "_"
)
# Remove the three annotation rows. drop = FALSE keeps the matrix shape even
# when only one sample row or one feature column is left; without it the
# result collapses to a plain vector and the data frame below is malformed.
mat_MS1dat <- mat_MS1dat[-(1:3), , drop = FALSE]
mat_MS1dat <- as.data.frame(mat_MS1dat)
# Expose the sample names as a column so the table can be merged on "filename".
mat_MS1dat$filename <- rownames(mat_MS1dat)

In [16]:
# Print the complete sample-by-feature table
mat_MS1dat

ERROR while rich displaying an object: Error in sprintf(wrap, header, body): 'fmt' length exceeds maximal format length 8192

Traceback:
1. FUN(X[[i]], ...)
2. tryCatch(withCallingHandlers({
 .     rpr <- mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
3. tryCatchList(expr, classes, parentenv, handlers)
4. tryCatchOne(expr, names, parentenv, handlers[[1L]])
5. doTryCatch(return(expr), name, parentenv, handler)
6. withCallingHandlers({
 .     rpr <- mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler)
7. mime2repr[[mime]](obj)
8. repr_latex.data.frame(obj)
9. repr_matrix_generic(obj, sprintf("\\begin{tabular}{%s}\n%%s%%s\\end{tabular}\n", 
 .     cols), "%s\\\\\n\\hline\n", "  &", " %s &", "%s", "\t%s\\\\\n", 
 .     "%s &", " %s &", escape_fun = latex_escape_vec, ...)
10. sprintf(

Unnamed: 0,1_219.0184_0.22,2_203.0525_0.24,3_254.1611_0.39,5_383.1157_0.35,6_163.0601_0.29,7_258.1101_0.26,8_204.123_0.3,9_381.0795_0.23,10_998.0813_4.93,11_220.118_0.54,⋯,8264_425.1928_3.02,8265_434.2235_3.52,8266_1072.8171_5.17,8267_553.2846_4.2,8268_337.0815_0.82,8269_553.2816_3.87,8270_781.6198_8.87,8271_337.0815_0.83,8272_471.1023_0.51,filename
G95973_repeat_RG9_01_31632.mzXML,1610375.404,1450424.601,613483.0,616527.687,588713.5,399022.4,650828.5,629961.8,185988.72,324385.0,⋯,2090.6195,447.9725,1486.940,1196.2780,5333.4170,204.1250,0.0000,5333.4170,283.9680,G95973_repeat_RG9_01_31632.mzXML
G95964_RA4_01_31456.mzXML,2222779.797,1642752.815,662945.1,710269.310,809426.7,313262.7,696347.1,1055987.6,50140.09,397495.0,⋯,9249.9370,612.4310,3622.381,509.3550,9421.3400,539.7440,0.0000,9421.3400,606.0825,G95964_RA4_01_31456.mzXML
G95965_RH3_01_31592.mzXML,2462434.252,2147674.492,757138.9,869053.039,669007.6,385612.5,763235.2,946280.3,82179.54,385041.4,⋯,8241.2495,643.9930,13709.553,868.1620,5333.6860,274.9320,290.1030,5333.6860,758.1500,G95965_RH3_01_31592.mzXML
G95984_repeat_RG10_01_31635.mzXML,13013.138,30427.820,1368443.0,0.000,942627.0,442855.9,1114556.2,2051366.7,371347.98,697540.2,⋯,218.7550,0.0000,3283.416,5658.4440,910.9350,1232.0360,0.0000,910.9350,11270.7260,G95984_repeat_RG10_01_31635.mzXML
G95960_repeat_RB10_01_31547.mzXML,4242.123,18359.641,1291443.2,0.000,776336.2,623491.6,1273871.2,4375694.9,302586.80,612339.0,⋯,0.0000,2698.8320,10652.775,0.0000,0.0000,3528.0005,631.2615,0.0000,3346.4075,G95960_repeat_RB10_01_31547.mzXML
G95952_BA6_01_31659.mzXML,15796.870,8924.331,1280366.6,0.000,856284.4,421872.2,973127.4,1608833.8,265316.19,420323.6,⋯,195.0410,1218.6345,1428.005,0.0000,1696.7560,1595.7875,391.7455,1696.7560,2737.6550,G95952_BA6_01_31659.mzXML
G95955_RE10_01_31522.mzXML,3535.226,24435.710,899953.1,0.000,1190917.1,369659.9,770328.7,4387261.6,188921.25,475794.6,⋯,183.4000,251.7130,1633.864,273.6295,0.0000,995.0430,255.6440,0.0000,6237.9725,G95955_RE10_01_31522.mzXML
G95953_BB6_01_31682.mzXML,9166.356,10258.182,1087247.8,0.000,1109143.6,311990.7,990233.5,1051595.9,289516.50,445752.1,⋯,0.0000,257.0875,9230.258,176.7420,0.0000,338.7555,188.2550,0.0000,14834.5825,G95953_BB6_01_31682.mzXML
G95961_BB10_01_31688.mzXML,9276.972,359453.696,481733.2,1816.282,838093.5,307517.6,691996.6,3312512.7,1174425.91,318427.0,⋯,2493.2375,18796.0345,7242.114,2129.3335,2935.5105,782.4490,1428.8005,2935.5105,9992.8410,G95961_BB10_01_31688.mzXML
G95956_repeat_RB5_01_31534.mzXML,6219.269,18477.978,1182390.4,0.000,1274634.5,388384.2,1149277.4,1549753.8,235634.83,459443.5,⋯,373.8060,949.5470,5528.860,264.8700,0.0000,1667.4205,1516.9805,0.0000,8635.9275,G95956_repeat_RB5_01_31534.mzXML


How many filenames in the feature table match with the metadata table

create output table containing only matching filenames 

In [19]:
# Inner join on "filename": only samples present in both tables are kept.
final_tab <- merge(x = metadata, y = mat_MS1dat, by = "filename")

create output table containing all files within the feature table

In [20]:
# Keep every row of the feature table; rows without a metadata match get NA
# in the metadata columns. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
final_tab_wblanks <- merge(metadata, mat_MS1dat, by = "filename", all.y = TRUE)
# Discard rows whose NA count equals the number of columns minus one, i.e.
# rows that carry a single non-missing value (such as the filename) and nothing else.
final_tab_wblanks <- final_tab_wblanks[rowSums(is.na(final_tab_wblanks)) != (ncol(final_tab_wblanks) - 1), ]

write tables to file

In [21]:
# Write the matched table without row names and without quoting.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
# NOTE(review): with quote = FALSE a field containing a comma would shift the
# columns of that row -- confirm no metadata value contains commas.
write.csv(final_tab, "FeatureTable_G4_yogurt_Clean.csv", row.names = FALSE, quote = FALSE)

In [22]:
# Write the table that includes unmatched files, without row names and
# without quoting. FALSE is spelled out because F is an ordinary variable
# that can be reassigned.
# NOTE(review): with quote = FALSE a field containing a comma would shift the
# columns of that row -- confirm no metadata value contains commas.
write.csv(final_tab_wblanks, "FeatureTable_G4_yogurt_Clean_wblanks.csv", row.names = FALSE, quote = FALSE)