# Processing configuration for ot_crispr

- Currently study level configuration is in a google spreadsheet.
- This now needs to be converted into a json file.
- This json needs to be updated with some new fields.
- Also some updates are expected in some of the existing fields. eg contrast description.
- JSON file is then be strored in a private github repo.

In [53]:
import pandas as pd
import json
import yaml

# Source Google Sheet:
config_sheet = "https://docs.google.com/spreadsheets/d/1eow_rTizGY_Vda0Q-8sFNWCOyFkKRSUen1Em1ysPSiA/export?format=csv"

# Reading into a pandas dataframe, while skipping the first row, that contains column descriptions:
study_df = (
    pd.read_csv(config_sheet, skiprows=[1])
    .assign(
        # Split mapped diseases to list:
        diseaseFromSourceMappedId=lambda df: df.diseases.str.replace(' ', '').str.split('|'),

        # Split data files to list:
        dataFiles=lambda df: df.dataFile.str.replace(' ', '').str.split('|')
    )
    # Dropping unused fields:
    .drop(['diseases', 'dataFile'], axis=1)
)

# This is the data which is written into the json:
study_df.head()

Unnamed: 0,studyId,projectId,projectDescription,studyOverview,releaseVersion,releaseDate,isCellTypeDerived,crisprScreenLibrary,crisprStudyMode,geneticBackground,cellType,cellLineBackground,contrast,dataFileType,filterColumn,threshold,ControlDataset,diseaseFromSourceMappedId,dataFiles
0,OTAR036_TAU_uptake_1,OTAR036,iPSC neuron CRISPR,Comparison of functional populations of iPSC n...,v1.0,2021-06-10,yes,"Kosuke v1.1 (Behan et al., 2018)",CRISPRn,,iPSC derived cortical neurons,KOLF2-C1,TauNEGTransPOS (only transferrin taken up) vs ...,MAGeCK,pos|p-value,0.01,,"[MONDO_0004975, EFO_0003096]",[Screen4_5_NEGPOSvPOSPOS.gene_summary.txt]
1,OTAR036_TAU_uptake_2,OTAR036,iPSC neuron CRISPR,Comparison of functional populations of iPSC n...,v1.0,2021-06-10,yes,"Kosuke v1.1 (Behan et al., 2018)",CRISPRn,,iPSC derived cortical neurons,KOLF2-C1,TauPOSTransPOS (both proteins taken up) vs T...,MAGeCK,pos|p-value,0.01,,"[MONDO_0004975, EFO_0003096]",[Screen1.2.3.POSPOSvsNEGPOS.gene_summary.txt]
2,OTAR033_IL17A-TNFa_FACS_upper-tail,OTAR033,Keratinocytes,Differentiated HaCaT keratinocytes were stimul...,v1.0,2021-06-10,no,"Kosuke v1.1 (Behan et al., 2018)",CRISPRn,TP53 H179Y/ R282W. WGS available,HaCaT keratinocytes,HaCaT,S100A9 Positive vs Negative,MAGeCK,pos|fdr,0.2,hacat.differentiation.facs.gene_summary.txt,[EFO_0000676],[hacat.il17a.facs.gene_summary.txt]
3,OTAR033_IL17A-TNFa_dropout_upper-tail,OTAR033,Keratinocytes,Differentiated HaCaT keratinocytes were stimul...,v1.0,2021-06-10,no,"Kosuke v1.1 (Behan et al., 2018)",CRISPRn,TP53 H179Y/ R282W. WGS available,HaCaT keratinocytes,HaCaT,t=14 vs t=0,MAGeCK,pos|fdr,0.2,hacat.differentiation.droput.gene_summary.txt,[EFO_0000676],[hacat.il17a.dropout.gene_summary.txt]
4,OTAR033_IL4_FACS_upper-tail,OTAR033,Keratinocytes,Differentiated HaCaT keratinocytes were stimul...,v1.0,2021-06-10,no,"Kosuke v1.1 (Behan et al., 2018)",CRISPRn,TP53 H179Y/ R282W. WGS available,HaCaT keratinocytes,HaCaT,IL13RA2 Positive vs Negative,MAGeCK,pos|fdr,0.2,hacat.differentiation.facs.gene_summary.txt,[EFO_0000274],[hacat.il4.facs.gene_summary.txt]


In [69]:
# Saving the table:
study_df.to_json('~/repositories/PPP-evidencie-configuration/ot_crispr_config.json', orient='records', indent=4, index=True)

# Let's try to read the data:
pd.read_json('~/repositories/PPP-evidencie-configuration/ot_crispr_config.json', orient='records')

# Let's try to read as dictionary
with open('~/repositories/PPP-evidencie-configuration/ot_crispr_config.json', 'rt') as f:
    config_df = json.load(f)

In [71]:
# How do a single element looks like:
print(json.dumps(config_df[1], indent=4))

{
    "studyId": "OTAR036_TAU_uptake_2",
    "projectId": "OTAR036",
    "projectDescription": "iPSC neuron CRISPR",
    "studyOverview": "Comparison of functional populations of iPSC neurons based on their ability to take up monomeric/aggregated tau protein",
    "releaseVersion": "v1.0",
    "releaseDate": "2021-06-10",
    "isCellTypeDerived": "yes",
    "crisprScreenLibrary": "Kosuke v1.1 (Behan et al., 2018)",
    "crisprStudyMode": "CRISPRn",
    "geneticBackground": null,
    "cellType": "iPSC derived cortical neurons",
    "cellLineBackground": "KOLF2-C1",
    "contrast": " TauPOSTransPOS  (both proteins taken up) vs TauNEGTransPOS (only transferrin taken up)",
    "dataFileType": "MAGeCK",
    "filterColumn": "pos|p-value",
    "threshold": 0.01,
    "ControlDataset": null,
    "diseaseFromSourceMappedId": [
        "MONDO_0004975",
        "EFO_0003096"
    ],
    "dataFiles": [
        "Screen1.2.3.POSPOSvsNEGPOS.gene_summary.txt"
    ]
}


In [75]:
import pandas as pd
import json
import requests

from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

def clear_n_split(row: f.col) -> f.col:
    return f.split(
                f.regexp_replace(row, ' ', ''), '|'
    )

(
    spark.read.csv('/Users/dsuveges/repositories/evidence_datasource_parsers/ot_study.csv', sep=',', header=True)
    .filter(f.col('diseases').rlike(r'[A-Z]+_[0-9]+'))
    .withColumn('diseaseFromSourceMappedId', clear_n_split(f.col('diseases')))
    .withColumn('dataFiles', clear_n_split(f.col('dataFile')))
    # Dropping unused fields:
    .drop('diseases', 'dataFile')
    .printSchema()
)

root
 |-- studyId: string (nullable = true)
 |-- projectId: string (nullable = true)
 |-- projectDescription: string (nullable = true)
 |-- studyOverview: string (nullable = true)
 |-- releaseVersion: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- isCellTypeDerived: string (nullable = true)
 |-- crisprScreenLibrary: string (nullable = true)
 |-- crisprStudyMode: string (nullable = true)
 |-- geneticBackground: string (nullable = true)
 |-- cellType: string (nullable = true)
 |-- cellLineBackground: string (nullable = true)
 |-- contrast: string (nullable = true)
 |-- dataFileType: string (nullable = true)
 |-- filterColumn: string (nullable = true)
 |-- threshold: string (nullable = true)
 |-- ControlDataset: string (nullable = true)
 |-- diseaseFromSourceMappedId: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- dataFiles: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [74]:
(
    spark.read.json('/Users/dsuveges/repositories/PPP-evidencie-configuration/ValidationLab_config.json')
    .printSchema()
)

root
 |-- _corrupt_record: string (nullable = true)

