## Changes

1. both the fdr and the p-value needs to meet the threshold

In [4]:
import pandas as pd


def parse_columns(columns):
    """Parsing column names to cell lines and replicates

    Behaviour: 'SIDM00049_CPID1214.gene_summary.txt_neg|fdr' -> 
    {
        column: 'SIDM00049_CPID1214.gene_summary.txt_neg|fdr', 
        cellLine: SIDM00146,
        replicate: CPID1214,
        stat: fdr
    }

    This then converted into a spark dataframe

    args:
    columns (list): a list with column names from the LogFC file

    Returns:
    spark.dataframe
    """

    # These columns are not interesting:
    drop = ['id', 'Note1', 'Note2', 'num']

    # These are the values
    keys = ['cellLine', 'replicate', 'stat', 'column']

    # Filter relevant columns:
    columns = [x for x in columns if x not in drop]
    print(f'Number of columns: {len(columns)}')
    print(f'An example: {columns[2]}')

    # Parsig values:
    try:
        parsedColumns = [dict(zip(keys, x.replace(
            '.gene_summary.txt_neg|', '_').split('_') + [x])) for x in columns]
        parsedColumns_df = pd.DataFrame(parsedColumns)
        parsedColumns_df = parsedColumns_df.loc[parsedColumns_df.stat.isin(
            ['p-value', 'fdr', 'lfc', 'goodsgrna'])]

    except error as e:
        raise e(f"Failed to parse the following columns: {', '.join(columns)}")

    return parsedColumns_df


def parse_targets(gene_pair):
    genes = gene_pair.split('~')

    assert(len(genes) == 2)
    parsed = []

    for i, v in enumerate(genes):
        parsed.append({
            'targetFromSourceId': v,
            'geneticBackground': genes[1] if i == 0 else genes[0]
        })

    return parsed


def parse_replicates(row):
    returnValue = []
    for cellLine in xf.cellLine.unique():
        isSignificant = False
        replicates = []
        for replicate in xf.loc[xf.cellLine == cellLine].replicate.unique():

            repl_columns = (
                xf.loc[
                    (xf.cellLine == cellLine)
                    & (xf.replicate == replicate)]
                .set_index('stat').column
            )

            pval = row[repl_columns['p-value']]
            lfc = row[repl_columns['lfc']]
            fdr = row[repl_columns['fdr']]
            isSignificantRepl = False

            if (pval < 0.05) & (fdr < 0.05):
                isSignificant = True
                isSignificantRepl = True

            replicates.append({
                'replicteId': replicate,
                "logFoldChange": lfc,
                "pValue": pval,
                "falseDiscoveryRate": fdr,
                "isSignificant": isSignificantRepl
            })

        if isSignificant:
            returnValue.append({
                'cellModelId': cellLine,
                'replicates': replicates
            })

    if isSignificant:
        return returnValue
    else:
        return None

In [193]:
# Get fixed values
input_file = '/Users/dsuveges/project_data/Shared_Data/OTAR2062/Aug21_ExactMatchData/BRCA-1_ExactMatch_LogFC_All.gene.stats.annotated.txt'
filter_column = ['lfc', 'p-value']

# Get list of files
# These maps needs to be finalized later:
CELL_MAP = {
    'SIDM00146': {
        'diseaseFromSourceMappedId': 'NCIT_C9140', 
        'tissue': 'bone marrow',
        'biosamplesFromSource': [ # Might not be suitable here
            'UBERON_0002371'  # bone marrow
        ], 
    }, # https://cellmodelpassports.sanger.ac.uk/passports/SIDM01076
}

# Get a list of hits from the log2fold dataset


# Get the relevant values from the GEMINI and BLISS datasets


# Format and output the data

In [194]:
import pandas as pd

In [203]:
# Reading datafile:
df = pd.read_csv(input_file, sep=' ')

# Extracting columns:
columns = df.columns

# Parse columns:
xf = parse_columns(columns)

# 
df_filtered = (
    df.head()
    .assign(parsed = df.apply(parse_replicates, axis=1))
    .loc[lambda df: df['parsed'].notna()]
    .explode('parsed')
    .assign(targets = lambda df: df['id'].apply(parse_targets))
    .explode('targets')
)
df_filtered.head()

Number of columns: 464
An example: SIDM00049_CPID1211.gene_summary.txt_neg|goodsgrna


KeyboardInterrupt: 

In [176]:
{x[1] for x in df.columns[4:].str.split('|')}

{'fdr', 'goodsgrna', 'lfc', 'p-value'}

In [151]:
parsed_combined = (
    pd.concat([
        df_filtered.parsed.apply(pd.Series),
        df_filtered.targets.apply(pd.Series)], axis=1)
    .assign(    
        datatypeId = "ot_partner",
        datasourceId = "encore",
        projectId = "OTAR2062",
        diseaseFromSourceId = 'NCIT_C9140', # This needs to be reviewed.
    )
)

parsed_combined = (
    pd.concat([
        parsed_combined,
        parsed_combined.cellModelId.map(CELL_MAP).apply(pd.Series)], axis=1)
    .drop(['tissue'], axis=1)
)

parsed_combined.head()

Unnamed: 0,cellModelId,replicates,targetFromSourceId,geneticBackground,datatypeId,datasourceId,projectId,diseaseFromSourceId,diseaseFromSourceMappedId,biosamplesFromSource
59,SIDM00146,"[{'replicteId': 'CPID1310', 'logFoldChange': -...",ABL2,CDK7,ot_partner,encore,OTAR2062,NCIT_C9140,NCIT_C9140,[UBERON_0002371]
59,SIDM00146,"[{'replicteId': 'CPID1310', 'logFoldChange': -...",CDK7,ABL2,ot_partner,encore,OTAR2062,NCIT_C9140,NCIT_C9140,[UBERON_0002371]
60,SIDM00146,"[{'replicteId': 'CPID1310', 'logFoldChange': -...",ABL2,CHEK1,ot_partner,encore,OTAR2062,NCIT_C9140,NCIT_C9140,[UBERON_0002371]
60,SIDM00146,"[{'replicteId': 'CPID1310', 'logFoldChange': -...",CHEK1,ABL2,ot_partner,encore,OTAR2062,NCIT_C9140,NCIT_C9140,[UBERON_0002371]
131,SIDM00146,"[{'replicteId': 'CPID1310', 'logFoldChange': 0...",ADAD1,AKT1,ot_partner,encore,OTAR2062,NCIT_C9140,NCIT_C9140,[UBERON_0002371]


In [148]:
parsed_combined.head().cellModelId.map(CELL_MAP).apply(pd.Series)

Unnamed: 0,diseaseFromSourceMappedId,tissue,biosamplesFromSource
59,NCIT_C9140,bone marrow,[UBERON_0002371]
59,NCIT_C9140,bone marrow,[UBERON_0002371]
60,NCIT_C9140,bone marrow,[UBERON_0002371]
60,NCIT_C9140,bone marrow,[UBERON_0002371]
131,NCIT_C9140,bone marrow,[UBERON_0002371]


In [149]:
parsed_combined.head().concat(parsed_combined.head().cellModelId.map(CELL_MAP).apply(pd.Series))

AttributeError: 'DataFrame' object has no attribute 'concat'

In [160]:
x = "SIDM00049_CPID1214.gene_summary.txt_neg|fdr"
x.replace('.gene_summary.txt_neg|', '_').split('_')

['SIDM00049', 'CPID1214', 'fdr']

In [190]:
# These columns are not interesting:
drop = ['id', 'Note1', 'Note2', 'num']

# These are the values 
keys = ['cellLine', 'replicate', 'stat', 'column']

# Filter relevant columns:
columns = [x for x in columns if x not in drop]
print(f'Number of columns: {len(columns)}')
print(f'An example: {columns[2]}')

# Parsig values:
try:
    parsedColumns = [dict(zip(keys, x.replace('.gene_summary.txt_neg|', '_').split('_') + [x])) for x in columns]
    parsedColumns_df = pd.DataFrame(parsedColumns)
    parsedColumns_df = parsedColumns_df.loc[parsedColumns_df.stat.isin(['p-value', 'fdr', 'lfc', 'goodsgrna'])]
except error as e:
    raise e(f"Failed to parse the following columns: {', '.join(columns)}")


Number of columns: 464
An example: SIDM00049_CPID1211.gene_summary.txt_neg|goodsgrna


In [187]:
x = columns[0]
dict(zip(keys, x.replace('.gene_summary.txt_neg|', '_').split('_') + [x]))

{'cellLine': 'SIDM00049',
 'replicate': 'CPID1211',
 'stat': 'p-value',
 'column': 'SIDM00049_CPID1211.gene_summary.txt_neg|p-value'}

In [191]:
parsedColumns_df.stat.value_counts()

lfc          86
goodsgrna    86
fdr          86
p-value      86
Name: stat, dtype: int64

In [201]:
len(xf.replicate.unique())

86

In [206]:
xf.loc[xf.cellLine == 'SIDM01090'].replicate.unique()

array(['CPID1011', 'CPID1014', 'CPID1017'], dtype=object)

In [208]:
(
    xf[['cellLine', 'replicate']]
    .drop_duplicates()
    .groupby('cellLine')
    .count()
)

Unnamed: 0_level_0,replicate
cellLine,Unnamed: 1_level_1
SIDM00049,3
SIDM00118,3
SIDM00136,9
SIDM00193,3
SIDM00194,3
SIDM00214,3
SIDM00359,3
SIDM00537,3
SIDM00677,3
SIDM00680,3


In [210]:
df.Note2.value_counts()

LibraryCombinations          16160
AnchorCombinations            1600
LibrarySingletons             1212
ESSENTIAL-NONTARGET            400
NONTARGET-NONTARGET            199
GIControlsSingletons           198
INTERGENIC-INTERGENIC          155
NONESSENTIAL-NONESSENTIAL      136
AnchorSingletons               120
ESSENTIAL-INTERGENIC            92
GIControlsCombinations          62
Name: Note2, dtype: int64

In [212]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
#Create spark session
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)



root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [213]:
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



In [214]:
unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"
unPivotDF = pivotDF.select("Product", expr(unpivotExpr)) \
    .where("Total is not null")
unPivotDF.show(truncate=False)
unPivotDF.show()

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
|Orange |China  |4000 |
|Beans  |China  |1500 |
|Beans  |Mexico |2000 |
|Banana |Canada |2000 |
|Banana |China  |400  |
|Carrots|Canada |2000 |
|Carrots|China  |1200 |
+-------+-------+-----+

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
| Orange|  China| 4000|
|  Beans|  China| 1500|
|  Beans| Mexico| 2000|
| Banana| Canada| 2000|
| Banana|  China|  400|
|Carrots| Canada| 2000|
|Carrots|  China| 1200|
+-------+-------+-----+



In [1]:
index_cols= ["Hospital","Hospital Address"]
drop_cols = ['Record']
# Select all columns which needs to be pivoted down
pivot_cols = [c  for c in df.columns if c not in index_cols+drop_cols ]
pivot_cols

NameError: name 'df' is not defined

In [42]:
# create sample data 
import pandas as pd
from pyspark.sql.functions import  *
from pyspark.sql import SparkSession

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)


panda_df = pd.DataFrame({'Record': {0: 1, 1: 2, 2: 3},
 'Hospital': {0: 'Red Cross', 1: 'Alberta Hospital', 2: 'General Hospital'},
 'Hospital Address': {0: '1234 Street 429',
  1: '553 Alberta Road 441',
  2: '994 Random Street 923'},
 'Medicine_1': {0: 'Effective', 1: 'Effecive', 2: 'Normal'},
 'Medicine_2': {0: 'Effective', 1: 'Normal', 2: 'Effective'},
 'Medicine_3': {0: 'Normal', 1: 'Normal', 2: 'Normal'},
 'Medicine_4': {0: 'Effective', 1: 'Effective', 2: 'Effective'}})
df = spark.createDataFrame(panda_df)

# calculate
df.select("Hospital","Hospital Address", 
          expr("stack(4, 'Medicine_1', Medicine_1, 'Medicine_2', Medicine_2, \
          'Medicine_3', Medicine_3,'Medicine_4',Medicine_4) as (MedicinName, Effectiveness)")
         ).where("Effectiveness is not null").show()

+----------------+--------------------+-----------+-------------+
|        Hospital|    Hospital Address|MedicinName|Effectiveness|
+----------------+--------------------+-----------+-------------+
|       Red Cross|     1234 Street 429| Medicine_1|    Effective|
|       Red Cross|     1234 Street 429| Medicine_2|    Effective|
|       Red Cross|     1234 Street 429| Medicine_3|       Normal|
|       Red Cross|     1234 Street 429| Medicine_4|    Effective|
|Alberta Hospital|553 Alberta Road 441| Medicine_1|     Effecive|
|Alberta Hospital|553 Alberta Road 441| Medicine_2|       Normal|
|Alberta Hospital|553 Alberta Road 441| Medicine_3|       Normal|
|Alberta Hospital|553 Alberta Road 441| Medicine_4|    Effective|
|General Hospital|994 Random Street...| Medicine_1|       Normal|
|General Hospital|994 Random Street...| Medicine_2|    Effective|
|General Hospital|994 Random Street...| Medicine_3|       Normal|
|General Hospital|994 Random Street...| Medicine_4|    Effective|
+---------

In [8]:
## Template pyspark data
## 

panda_df = pd.DataFrame(
    {'Record': {0: 1, 1: 2, 2: 3},
 'Hospital': {0: 'Red Cross', 1: 'Alberta Hospital', 2: 'General Hospital'},
 'Hospital Address': {0: '1234 Street 429',
  1: '553 Alberta Road 441',
  2: '994 Random Street 923'},
 'Medicine_1': {0: 'Effective', 1: 'Effecive', 2: 'Normal'},
 'Medicine_2': {0: 'Effective', 1: 'Normal', 2: 'Effective'},
 'Medicine_3': {0: 'Normal', 1: 'Normal', 2: 'Normal'},
 'Medicine_4': {0: 'Effective', 1: 'Effective', 2: 'Effective'}})
df = spark.createDataFrame(panda_df)
df.show()

+------+----------------+--------------------+----------+----------+----------+----------+
|Record|        Hospital|    Hospital Address|Medicine_1|Medicine_2|Medicine_3|Medicine_4|
+------+----------------+--------------------+----------+----------+----------+----------+
|     1|       Red Cross|     1234 Street 429| Effective| Effective|    Normal| Effective|
|     2|Alberta Hospital|553 Alberta Road 441|  Effecive|    Normal|    Normal| Effective|
|     3|General Hospital|994 Random Street...|    Normal| Effective|    Normal| Effective|
+------+----------------+--------------------+----------+----------+----------+----------+



## Sample dataset

Generating a simple dataset to prototype stacking:

In [44]:
# Generate pandas dataframe
pdf = pd.DataFrame({
    'animal': ['cat', 'dog', 'chick'],
    'voice': ['miau', 'vau', 'csip'],
    
    # Data from farm 1
    'farm1_count': [1, 1, 12],
    'farm1_legs': [4, 4, 24],
    
    # Data from farm 2
    'farm2_count': [1, 1, 22],
    'farm2_legs': [4, 4, 44],
    
    # Data from farm 3
#     'farm3_count': [2, 5, 2],
#     'farm3_legs': [8, 20, 4],
})

# Convert to spark:
df = spark.createDataFrame(pdf)
df.show()

+------+-----+-----------+----------+-----------+----------+
|animal|voice|farm1_count|farm1_legs|farm2_count|farm2_legs|
+------+-----+-----------+----------+-----------+----------+
|   cat| miau|          1|         4|          1|         4|
|   dog|  vau|          1|         4|          1|         4|
| chick| csip|         12|        24|         22|        44|
+------+-----+-----------+----------+-----------+----------+



In [46]:
# # Generate the "unpivot" expression:
# unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"
# unPivotDF = pivotDF.select("Product", expr(unpivotExpr)) \
#     .where("Total is not null")
# unPivotDF.show(truncate=False)

unpivot_expression = '''stack(2, 'farm1', farm1, 'farm2', farm2) as (farm, data)'''
xs = map -> [[farm1_count|farm1_legs], [farm2_count|farm2_legs]] -> [('farm1', col), ()]

resDF = reduce(lambda DF,value: DF.withColumn(*value) , xs, df)

colum_operatins = [('farm1' ,struct(('farm1_count').alias('count'),col('farm1_legs').alias('legs')))]
(
    df.
    .withColumn('farm1', struct(col('farm1_count').alias('count'),col('farm1_legs').alias('legs')))
    .withColumn('farm2', struct(col('farm2_count').alias('count'),col('farm2_legs').alias('legs')))
    .select('animal', 'voice', expr(unpivot_expression))
    .select('*', 'data.*')
    .show()
)

+------+-----+-----+--------+-----+----+
|animal|voice| farm|    data|count|legs|
+------+-----+-----+--------+-----+----+
|   cat| miau|farm1|  {1, 4}|    1|   4|
|   cat| miau|farm2|  {1, 4}|    1|   4|
|   dog|  vau|farm1|  {1, 4}|    1|   4|
|   dog|  vau|farm2|  {1, 4}|    1|   4|
| chick| csip|farm1|{12, 24}|   12|  24|
| chick| csip|farm2|{22, 44}|   22|  44|
+------+-----+-----+--------+-----+----+



In [22]:
?stack

Object `stack` not found.


In [47]:
reduce

NameError: name 'reduce' is not defined