In [15]:
from functools import reduce
import pandas as pd
from pyspark.sql.functions import col, udf, struct, lit, split
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField
from pyspark.sql import SparkSession
from collections import defaultdict

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)


folder = '/Users/dsuveges/project_data/Shared_Data/OTAR2062/Aug21_ExactMatchData'

## 
## Overlap between the BRCA files 
##

files = [
    'BRCA-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt',
    'BRCA-1_ExactMatch_Gemini_STRONG_ALL.txt',
    'BRCA-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt', #
    'BRCA-1_ExactMatch_LogFC_All.gene.stats.annotated.txt' # Get this.
]

# Read all files and generate a list of file header:
headers = {}
for file in files:
    headers[file] = (
        spark.read.csv(f'{folder}/{file}', sep=' ', header=True)
        .columns
    )
    


In [35]:
from collections import defaultdict

cleaned_header = defaultdict(list)
default_values = {
    file: False for file in headers.keys()
}

for file, header in headers.items():
    cell_lines = list({x.split('_')[0] for x in header})
    for cell_line in cell_lines:
        try:
            cleaned_header[cell_line][file] = True
        except:
            cleaned_header[cell_line] = default_values.copy()
            cleaned_header[cell_line][file] = True
    
    
print(
    pd.DataFrame(cleaned_header)
    .transpose()
    .rename(columns={
        'BRCA-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt': 'GEMINI_lethality',
        'BRCA-1_ExactMatch_Gemini_STRONG_ALL.txt': 'GEMINI_strong',
        'BRCA-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt': 'GEMINI_recovery',
        'BRCA-1_ExactMatch_LogFC_All.gene.stats.annotated.txt': 'LogFC'
    })
    .sort_values('LogFC')
    .to_markdown()
)

|           |   GEMINI_lethality |   GEMINI_strong |   GEMINI_recovery |   LogFC |
|:----------|-------------------:|----------------:|------------------:|--------:|
| SIDM00146 |                  1 |               1 |                 1 |       0 |
| SIDM00272 |                  1 |               1 |                 1 |       0 |
| SIDM01042 |                  1 |               1 |                 1 |       0 |
| SIDM00875 |                  1 |               1 |                 1 |       0 |
| Gene      |                  1 |               1 |                 1 |       0 |
| SIDM00957 |                  0 |               0 |                 0 |       1 |
| SIDM00214 |                  0 |               0 |                 0 |       1 |
| SIDM00833 |                  0 |               0 |                 0 |       1 |
| SIDM00681 |                  0 |               0 |                 0 |       1 |
| SIDM00834 |                  0 |               0 |                 0 |       1 |
| SI

In [23]:
all_files = list(headers.keys())
# for cell, files in cleaned_header:
all_files

['BRCA-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt',
 'BRCA-1_ExactMatch_Gemini_STRONG_ALL.txt',
 'BRCA-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt',
 'BRCA-1_ExactMatch_LogFC_All.gene.stats.annotated.txt']

In [42]:
folder = '/Users/dsuveges/project_data/Shared_Data/OTAR2062/Aug21_ExactMatchData'

## 
## Overlap between the BRCA files 
##

files = [
    'COLO-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt',
    'COLO-1_ExactMatch_Gemini_STRONG_ALL.txt',
    'COLO-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt', #
    'COLO-1_ExactMatch_LogFC_All.gene.stats.annotated.txt' # Get this.
]

# Read all files and generate a list of file header:
headers = {}
for file in files:
    headers[file] = (
        spark.read.csv(f'{folder}/{file}', sep=' ', header=True)
        .columns
    )
    

cleaned_header = defaultdict(list)
default_values = {
    file: False for file in headers.keys()
}

for file, header in headers.items():
    cell_lines = list({x.split('_')[0] for x in header})
    for cell_line in cell_lines:
        try:
            cleaned_header[cell_line][file] = True
        except:
            cleaned_header[cell_line] = default_values.copy()
            cleaned_header[cell_line][file] = True
    
    
print(
    pd.DataFrame(cleaned_header)
    .transpose()
    .rename(columns={
        'COLO-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt': 'GEMINI_lethality',
        'COLO-1_ExactMatch_Gemini_STRONG_ALL.txt': 'GEMINI_strong',
        'COLO-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt': 'GEMINI_recovery',
        'COLO-1_ExactMatch_LogFC_All.gene.stats.annotated.txt': 'LogFC'
    })
    .sort_values('LogFC')
    .to_markdown()
)

|           |   GEMINI_lethality |   GEMINI_strong |   GEMINI_recovery |   LogFC |
|:----------|-------------------:|----------------:|------------------:|--------:|
| Gene      |                  1 |               1 |                 1 |       0 |
| SIDM00118 |                  1 |               1 |                 1 |       1 |
| SIDM00049 |                  1 |               1 |                 1 |       1 |
| SIDM00778 |                  1 |               1 |                 1 |       1 |
| SIDM00783 |                  1 |               1 |                 1 |       1 |
| SIDM00359 |                  1 |               1 |                 1 |       1 |
| SIDM00782 |                  1 |               1 |                 1 |       1 |
| SIDM00537 |                  1 |               1 |                 1 |       1 |
| SIDM00837 |                  1 |               1 |                 1 |       1 |
| SIDM00677 |                  1 |               1 |                 1 |       1 |
| SI

In [44]:
folder = '/Users/dsuveges/project_data/Shared_Data/OTAR2062/Aug21_AggregatedData/'

## 
## Overlap between the BRCA files 
##

files = [
    'COLO-1_ExactMatch_GEMINI_STRONG_ALL.txt',
    'COLO-1_ExactMatch_LogFC_Cell-Line-Average_All.gene.stats.annotated.txt' # Get this.
]

# Read all files and generate a list of file header:
headers = {}
for file in files:
    headers[file] = (
        spark.read.csv(f'{folder}/{file}', sep=' ', header=True)
        .columns
    )
    

cleaned_header = defaultdict(list)
default_values = {
    file: False for file in headers.keys()
}

for file, header in headers.items():
    cell_lines = list({x.split('_')[0] for x in header})
    for cell_line in cell_lines:
        try:
            cleaned_header[cell_line][file] = True
        except:
            cleaned_header[cell_line] = default_values.copy()
            cleaned_header[cell_line][file] = True
    
    
print(
    pd.DataFrame(cleaned_header)
    .transpose()
    .rename(columns={
#         'COLO-1_ExactMatch_Gemini_SENSITIVE_LETHALITY_ALL.txt': 'GEMINI_lethality',
        'COLO-1_ExactMatch_Gemini_STRONG_ALL.txt': 'GEMINI_strong',
#         'COLO-1_ExactMatch_Gemini_SENSITIVE_RECOVERY_ALL.txt': 'GEMINI_recovery',
        'COLO-1_ExactMatch_LogFC_Cell-Line-Average_All.gene.stats.annotated.txt': 'LogFC'
    })
    .sort_values('LogFC')
    .to_markdown()
)

|           |   COLO-1_ExactMatch_GEMINI_STRONG_ALL.txt |   LogFC |
|:----------|------------------------------------------:|--------:|
| Gene      |                                         1 |       0 |
| SIDM00118 |                                         1 |       1 |
| SIDM00049 |                                         1 |       1 |
| SIDM00778 |                                         1 |       1 |
| SIDM00783 |                                         1 |       1 |
| SIDM00359 |                                         1 |       1 |
| SIDM00782 |                                         1 |       1 |
| SIDM00537 |                                         1 |       1 |
| SIDM00837 |                                         1 |       1 |
| SIDM00677 |                                         1 |       1 |
| SIDM00834 |                                         1 |       1 |
| SIDM00681 |                                         1 |       1 |
| SIDM00833 |                                   