In [None]:
#r "nuget: FSharp.Stats, 0.4.3"
#r "nuget: BioFSharp, 2.0.0-beta5"
#r "nuget: BioFSharp.IO, 2.0.0-beta5"
#r "nuget: Plotly.NET, 4.2.0"
#r "nuget: Deedle, 2.5.0"
#r "nuget: Plotly.NET.Interactive, 4.2.0"
#r "nuget: ARCtrl"
#r "nuget: ARCtrl.NET, 1.0.5"
#r "nuget: ARCtrl.QueryModel, 1.0.5"

open System.IO
open Deedle
open BioFSharp
open FSharpAux
open FSharp.Stats
open Plotly.NET
open ARCtrl
open ARCtrl.NET
open ARCtrl.QueryModel
open ARCtrl.ISA

## I. Reading the sample description

Before we analyze our data, we will download and read the sample description provided by the experimentalist.

In [None]:
// let path = @"..\"

// let arc = ARC.load path
// let i = arc.ISA.Value

In [None]:
/// Returns `f` unchanged when it already has a file extension;
/// otherwise returns `f` with the "wiff" extension applied.
let normalizeFileName (f: string) =
    match Path.HasExtension f with
    | true -> f
    | false -> Path.ChangeExtension(f, "wiff")

//        
// let getStrain (fileName: string) =
//     let fN = fileName |> normalizeFileName
//     i.ArcTables.CharacteristicsOf(fN,"Cultivation")
//         .WithName("strain")
//         .[0]
//         .ValueText

// //
// let getExpressionLevel (fileName: string) =
//     let fN = fileName |> normalizeFileName 
//     i.ArcTables.CharacteristicsOf(fN,"Cultivation")
//         .WithName("gene expression")
//         .[0]
//         .ValueText

/// Extracts the group identifier from a file name: the third "_"-separated
/// segment, truncated at its first ".".
/// Fails with an index error when the name has fewer than three "_"-separated segments.
let getGroupID (fileName: string) =
    // ARC-based lookup, currently disabled in favour of parsing the file name:
    // let fN = fileName |> normalizeFileName
    // i.ArcTables.ParametersOf(fN,"Protein extraction")
    //     .WithName("Group name")
    //     .[0]
    //     .ValueText
    let underscoreParts = fileName.Split("_")
    let dotParts = underscoreParts.[2].Split(".")
    dotParts.[0]

/// Extracts the dilution factor from a file name: the second "_"-separated segment.
/// Fails with an index error when the name contains no "_".
let getDilutionFactor (fileName : string) =
    // ARC-based lookup, currently disabled in favour of parsing the file name:
    // let fN = fileName |> normalizeFileName
    // i.ArcTables.ParametersOf(fN,"Dilution Series")
    //     .WithName("Dilution Factor")
    //     .[0]
    //     .ValueText
    let segments = fileName.Split("_")
    segments.[1]

A quick execution to test the retrieval of data from the ISA sample table:

In [None]:

// getStrain "Whole Cell UVM4 1to5 G3"
// getExpressionLevel "Whole Cell UVM4 1to5 G3"
// getGroupID "Whole Cell UVM4 1to5 G3"
// getDilutionFactor "Whole Cell UVM4 1to5 G3"



Now that we have the sample sheet, all that is missing is the data to be analyzed:

In [None]:
// Location of the downloaded, annotated QProt result table.
// Built from the current user's profile folder so the notebook is not tied to
// one machine's account name; adjust the trailing segments if the file lives elsewhere.
let path =
    Path.Combine(
        System.Environment.GetFolderPath(System.Environment.SpecialFolder.UserProfile),
        "Downloads",
        "QProt_Annotated.csv"
    )


## II. Raw data access using Deedle:
As teased in the primer, we want to work with our tabular data using Deedle. Luckily, Deedle not only provides data frame and series
manipulation, but also allows us to quickly read the recently downloaded data into memory:

In [None]:
// Read the quantification table into a Deedle frame.
// The file is parsed as tab-separated, even though its name ends in .csv.
let rawData = Frame.ReadCsv(path, separators = "\t")

To visualize the data, we can call the "Print()" function.

In [None]:
// Print a textual view of the raw frame.
rawData.Print()

Looking at the raw data, we can see that each row contains a different quantification of a peptide ion, with the columns containing 
a single ion feature each, such as peptide ion charge, sequence or a quantification value reported for a file (e.g. light, heavy or ratio).
Since the columns ProteinGroup, StringSequence, PepSequenceID and Charge uniquely identify a row, we can use these to index the rows.
For this, we use a language feature called ["anonymous record type"](https://docs.microsoft.com/en-us/dotnet/fsharp/language-reference/anonymous-records). 
Here we create a tuple-like structure, with the additional feature that each element of the tuple is named (e.g. ProteinGroup).

In [None]:
/// `rawData` re-indexed so that every row is keyed by an anonymous record built
/// from its ProteinGroup, Synonym, StringSequence, PepSequenceID and Charge columns.
let indexedData =
    // Builds the composite row key from a single row of the raw frame.
    let toRowKey (row: ObjectSeries<string>) =
        {|
            ProteinGroup    = row.GetAs<string>("ProteinGroup")
            Synonyms        = row.GetAs<string>("Synonym")
            // StringSequence is the peptide sequence
            StringSequence  = row.GetAs<string>("StringSequence")
            PepSequenceID   = row.GetAs<int>("PepSequenceID")
            Charge          = row.GetAs<int>("Charge")
        |}
    rawData |> Frame.indexRowsUsing toRowKey
        
/// Prints a frame keyed by the peptide-ion record, after flattening each row key
/// into a comma-separated string of its fields.
let inline printIndexedData (f: Frame<{| Charge: int; PepSequenceID: int; ProteinGroup: string; StringSequence: string; Synonyms: string |},string>) =
    let printable =
        f
        |> Frame.mapRowKeys (fun key -> $"{key.ProteinGroup},{key.Synonyms},{key.StringSequence},{key.PepSequenceID},{key.Charge}")
    printable.Print()

In [None]:
// The effect of our frame manipulation can be observed:
printIndexedData indexedData

In [None]:
// Tag that separates the two QProt standards; rows are labelled with one of
// these cases so the two can be told apart later on.
type Qprot = 
    /// Assigned to rows whose ProteinGroup contains "QProt_newCBB" but not "QProt_newPS".
    | CBB
    /// Assigned to rows whose ProteinGroup contains "QProt_newPS" but not "QProt_newCBB".
    | PS 

/// `indexedData` restricted to rows that belong to exactly one QProt standard,
/// with the matching `Qprot` case added to each row key as the `QProt` field.
let finalRaw =
    // Decides which QProt standard a protein group belongs to, if any.
    let classify (proteinGroup: string) =
        let hasCBB = proteinGroup |> String.contains "QProt_newCBB"
        let hasPS = proteinGroup |> String.contains "QProt_newPS"
        if hasCBB && not hasPS then Some CBB
        elif hasPS && not hasCBB then Some PS
        // neither marker, or both at once: the row is not kept
        else None
    indexedData
    |> Frame.mapRowKeys (fun key -> {| key with QProt = classify key.ProteinGroup |})
    |> Frame.filterRows (fun key _ -> Option.isSome key.QProt)
    // every remaining key carries Some case, so unwrapping is safe here
    |> Frame.mapRowKeys (fun key -> {| key with QProt = Option.get key.QProt |})

/// Prints a frame keyed by the QProt-tagged peptide-ion record, after flattening
/// each row key into a comma-separated string of its fields.
let inline printIndexedQProtData (f: Frame<{| Charge: int; PepSequenceID: int; ProteinGroup: string; QProt: Qprot; StringSequence: string; Synonyms: string |},string>) =
    let printable =
        f
        |> Frame.mapRowKeys (fun key -> $"{key.ProteinGroup},{key.Synonyms},{key.StringSequence},{key.PepSequenceID},{key.Charge},{key.QProt}")
    printable.Print()

In [None]:
printIndexedQProtData finalRaw

## IV. Quality control.

With our data frame prepared, we want to see if our dilution experiment worked.
We plot the overall mean of the 14N and 15N quantifications and observe if we can recover our dilution series (15N),
while keeping the analyte to be quantified at a constant level (14N).

Since it comes in handy to simplify the data frame, we will only keep columns that contain a specific identifier,
such as "Ratio", "Quant_Light" or "Quant_Heavy".

In [None]:
/// Keeps only the columns whose key contains "." followed by `quantColID`, and
/// renames each kept column to the part of its key before the first ".".
let sliceQuantColumns quantColID frame = 
    let marker = "." + quantColID
    frame
    |> Frame.filterCols (fun colKey _ -> colKey |> String.contains marker)
    |> Frame.mapColKeys (fun colKey -> colKey.Split('.').[0])

// How did the data frame change, how did the column headers change?
// get everything that had Ratio in the Name (eg WCGr_5F_1.Ratio), cut everything else & remove Ratio from Name 
// Columns whose header contains ".Ratio", renamed to the header part before the dot.
let ratios = finalRaw |> sliceQuantColumns "Ratio"
// Columns whose header contains ".Quant_Light".
let light  = finalRaw |> sliceQuantColumns "Quant_Light"
// Columns whose header contains ".Quant_Heavy".
let heavy  = finalRaw |> sliceQuantColumns "Quant_Heavy"


In [None]:
printIndexedQProtData ratios

In [None]:

/// Abbreviation for the anonymous-record row key of the QProt-tagged frames:
/// the peptide-ion identifying fields plus the `QProt` case.
/// Used to annotate the frame parameters of the plotting functions.
type PeptideIon = 
    {|
        ProteinGroup    : string  
        Synonyms        : string
        // the peptide sequence
        StringSequence  : string
        PepSequenceID   : int
        Charge          : int
        QProt           : Qprot
    |}

/// Draws one vertical box plot per numeric column of `f`, using only the rows
/// whose key carries the given `qprot` case.
/// Each box is labelled "<dilution factor>, <group id>", both parsed from the column key.
let createBoxPlot (qprot:Qprot) (f:Frame<PeptideIon,string>) = 
    // Turns one column into a box trace; every value is paired with the column's label.
    let toBoxTrace (colKey: string) (column: Series<PeptideIon,float>) =
        let labels, intensities =
            column
            |> Series.values
            |> Seq.map (fun value -> $"{getDilutionFactor colKey}, {getGroupID colKey}", value)
            |> Seq.unzip
        Chart.BoxPlot(X = labels, Y = intensities, Orientation = StyleParam.Orientation.Vertical)

    let rowsOfInterest =
        f |> Frame.filterRows (fun key _ -> key.QProt = qprot)

    rowsOfInterest
    |> Frame.getNumericCols
    |> Series.map toBoxTrace
    |> Series.values
    |> Chart.combine
    |> Chart.withYAxisStyle "Ion intensity"
    |> Chart.withLegend(false)

In [None]:
// Specify the QProt you want to have a look at.
// `ratios` is plotted because no `filteredRatios` value is defined anywhere in this notebook.
ratios
|> createBoxPlot PS

In [None]:

// `ratios` is plotted because no `filteredRatios` value is defined anywhere in this notebook.
ratios
|> createBoxPlot CBB

In [None]:
/// Draws one line per peptide ion of the given `qprot` case, using only the
/// columns whose parsed group id equals `groupID`.
/// The x values are the dilution factors parsed from the column keys; each trace
/// is named "<sequence>, <charge>, <synonyms>".
let createLineChart (qprot:Qprot) (groupID: string) (f:Frame<PeptideIon,string>) = 
    // Builds the line trace of a single peptide ion from its per-file values.
    let toLineTrace (ion: PeptideIon) (valuesByFile: Series<string,float>) =
        valuesByFile
        |> Series.observations
        |> Seq.map (fun (fileName, value) -> getDilutionFactor fileName, value)
        |> Chart.Line
        |> Chart.withTraceInfo($"{ion.StringSequence}, {ion.Charge}, {ion.Synonyms}")

    let selection =
        f
        |> Frame.filterRows (fun key _ -> key.QProt = qprot)
        |> Frame.filterCols (fun colKey _ -> getGroupID colKey = groupID)

    selection
    // transpose so that every peptide ion becomes a column of per-file values
    |> Frame.transpose
    |> Frame.getNumericCols
    |> Series.map toLineTrace
    |> Series.values
    |> Chart.combine
    |> Chart.withYAxisStyle "Ion intensity"
    |> Chart.withSize(1600,900)

In [None]:
// `ratios` is plotted because no `filteredRatios` value is defined anywhere in this notebook.
ratios
|> createLineChart PS "G1"