# Pipeline

In [None]:
!nvidia-smi

In [2]:
import os 
import dlpro
from dlpro import constants, data, eval, layers, models, pipelines, reports, utils
import tensorflow as tf
from dlpro.eval.rt_eval import delta95_metric
from dlpro.eval.rt_eval import TimeDeltaMetric
from dlpro.data.RetentionTimeDataset import RetentionTimeDataset
from dlpro.models.prosit import PrositRetentionTimePredictor
from dlpro.reports.RetentionTimeReport import RetentionTimeReport

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator

import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from matplotlib.offsetbox import AnchoredText

%matplotlib inline

# To save the history as dict
# import pickle

import functions # -> the functions written for this analysis are being called from functions.py file in dlpro 

In [None]:
# --- Input / output locations used throughout the notebook ---

# Raw data location on the server; it is filtered before any further use.
RAW_FILE_PATH = "/scratch/yangyang_0.01FDR/evidence.txt"

PROSIT_DATAPATH = "/scratch/prosit_original/data.csv"

BASE_MODEL_WEIGHTS = "./prosit_ekin_training_best_checkpoint"

# Do the denormalization with this data when predicting with the base model.
TEST_DATAPATH = "/scratch/dongxue_tissue_1.0FDR/dongxue_P013129_no_fdr_irt_holdout_data.csv"

TRAIN_DATAPATH = "/scratch/dongxue_tissue_0.01FDR/dongxue_P013129_irt_train_data.csv"

# Must point at data that has already been filtered.
DATA_TO_BE_INDEXED = "./yangyang_0.01FDR/filtered_yangyang_0.01FDR.csv"

# --- Datasets ---

# Training dataset built from TRAIN_DATAPATH, with 25% of it set aside via val_ratio.
refinement_training_data = RetentionTimeDataset(
    data_source=TRAIN_DATAPATH,
    pad_length=30,
    batch_size=1024,
    val_ratio=0.25,
    test=False,
    sample_run=False,
)

# Test dataset built from TEST_DATAPATH (test=True).
test_data = RetentionTimeDataset(
    data_source=TEST_DATAPATH,
    pad_length=30,
    batch_size=64,
    test=True,
)


## Filtering raw files
    > Filtering according to duplicates with low scores

In [None]:
# Both the 0.01 and the 1.0 FDR files should be filtered.
# The helper lives in functions.py, which is imported as a module (`import functions`),
# so it must be reached through the module name.
functions.raw_data_filter(RAW_FILE_PATH, new_folder_name="yangyang_0.01FDR")

### Visualizing the Data
    > The visualization can be used with any file that has a sequence column and an irt (or rt) column. It shows histograms of the rt values and of the sequence lengths.

In [None]:
# Plot the distributions for the filtered file (folder, file name, plot header).
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.distributionVisualization(
    "./yangyang_0.01FDR/",
    "filtered_yangyang_0.01FDR.csv",
    header="Yangyang 0.01 FDR",
)

## Indexing a dataset

In [None]:
# Location of the filtered data inside the folder created by raw_data_filter().
# This data should contain only rt values, i.e. it must not be indexed yet.
FILTERED_DATAPATH = "./dongxue_1.0_FDR/filtered_dongxue_1.0_FDR.csv"

In [None]:
# Create the indexing data frame that the linear regression model is built from.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
reference_DF = functions.indexingDataPreparation(FILTERED_DATAPATH, PROSIT_DATAPATH)

In [None]:
# Create a regression model from the provided reference data frame.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
rgr = functions.build_regression_model(reference_DF)

In [None]:
# Index the filtered data with the regression model; irt_data_name names the output.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.indexandsplit(
    DATA_TO_BE_INDEXED,
    rgr,
    irt_data_name="indexed_yangyang_0.01FDR",
)

## Refining a model

In [None]:
# Refine the base model, starting from BASE_MODEL_WEIGHTS.
# `refinement_training_data` is the training dataset constructed earlier in this notebook;
# the previously used name `rtdata` is not defined anywhere in it.
# NOTE(review): the output name mentions yangyang while TRAIN_DATAPATH points at dongxue
# data -- confirm that this dataset/name pairing is the intended one.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.modelRefine(
    refinement_training_data,
    BASE_MODEL_WEIGHTS,
    new_weights_file_name="yangyang_0.01FDR_refinement",
    learning_rate=0.0000001,
    SEQ_LENGTH=30,
    epoch_number=1,
)

In [None]:
# To inspect the refinement history, read the history data frame that the
# modelRefine() function created.
history = pd.read_csv("yangyang_0.01FDR_refinement_historyDF")

In [None]:
# Bare expression: the notebook renders the history data frame as the cell output.
history

## Predicting with a model 
    > This does not have to be the refined model, but the parameters should be defined accordingly

In [None]:
# Weights file inside the folder that modelRefine() creates automatically
# (the folder is named after the name given to that function).
REFINED_MODEL_WEIGHTS_PATH = "./dongxue_indexed_P013129_refinement/weight_118_0.04123"

In [None]:
# Predict on the test dataset with the chosen weights; the training dataset is passed along too.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
model_predictions, test_targets = functions.modelPredict(
    REFINED_MODEL_WEIGHTS_PATH,
    test_data,
    refinement_training_data,
)

In [None]:
# Plot the predictions against the test targets under the given header.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.density_plot(
    model_predictions,
    test_targets,
    header="Dongxue Refined Model Test with \nDongxue 1.0 FDR Holdout",
)

## Creating Predictions Data Frame for SVM analysis

In [None]:
# Create the prediction data frame used for the SVM analysis.
# `test_data` is the test dataset constructed earlier in this notebook (the one the
# predictions were made on); the name `test_rtdata` is not defined anywhere in it.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.predictionDataFrame(
    model_predictions,
    test_targets,
    test_data,
    modelName="DongxueRefined",
)

In [None]:
# This location should address the prediction data frame created with the
# predictionDataFrame() function above.
PREDICTION_DF_LOCATION = "./SVM_DataFrames/DongxueRefinedModelPredictionDF.csv"

In [None]:
# The data used in svmFormatter() should be the indexed and filtered file of the
# predicted peptides. That file comes from indexandsplit() and is named after the
# irt_data_name parameter given to that function.
# NOTE(review): "./" is only a directory placeholder -- set this to the actual file path
# before running svmFormatter().
INDEXED_AND_FILTERED_DATAPATH = "./" 

In [None]:
# Format the indexed/filtered data together with the prediction data frame for the SVM step.
# The helper is defined in functions.py, imported as `functions`, hence the qualified call.
functions.svmFormatter(
    INDEXED_AND_FILTERED_DATAPATH,
    PREDICTION_DF_LOCATION,
    header="DongxueRefined",
)

## mokapot analysis 
    > Identification follows the pipeline, but the mokapot package could not be installed and imported in this environment

In [None]:
import mokapot

In [None]:
# Read the PSMs from the "phospho_rep1.pin" file with mokapot.
# NOTE(review): this requires the mokapot import to succeed -- confirm it is installed.
psms = mokapot.read_pin("phospho_rep1.pin")

In [None]:
# results, models = mokapot.brew(psms)

In [None]:
# results.to_txt()