# Data preparation

In [68]:
import pandas as pd
import os

# Impute your path
work_dir =  os.getcwd()
directory = os.path.join(work_dir, 'output')
dfs = {}

# Download all files relevant to *_combined.tsv
for filename in os.listdir(directory):
    if filename.endswith('_combined.tsv'):
        sample_name = filename.split('_combined.tsv')[0]
        file_path = os.path.join(directory, filename)
        dfs[sample_name] = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')

# Check DataFrames
for sample, df in dfs.items():
    print(f"DataFrame for {sample}:")
    print(df.head())

  dfs[sample_name] = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')
  dfs[sample_name] = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')
  dfs[sample_name] = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')


DataFrame for POU538:
  CHROM     POS ID REF ALT  QUAL FILTER POU538_GT POU538_DP Allele  ...  \
0     1   69270  .   A   G   969   PASS         .         0    G,G  ...   
1     1   69511  .   A   G  3070   PASS         .         0      G  ...   
2     1   69761  .   A   T   478   PASS         .         0      T  ...   
3     1   69897  .   T   C   334   PASS         .         0      C  ...   
4     1  129285  .   G   A   134   PASS       1/1         7    A,A  ...   

  dgWES_AC_Het dgWES_AC_Hom dgWES_AC_Hemi                dgWES_HWE  \
0        27,27      924,924           0,0                      0,0   
1            8         1038             0                        0   
2           37           80             0              4.76133e-41   
3          177          520             0                        0   
4        66,66      118,118           0,0  1.54143e-44,1.54143e-44   

                  gnomADv3        gnomADv3_AF gnomADv3_AC_raw gnomADv3_AC_XY  \
0  rs201219564,rs201219564

The column CADD_phred needs to be converted to a numerical form so that it can filter

In [69]:
import numpy as np
import re

def extract_numbers(row):
    numbers = re.findall(r'\d+\.\d+', row)  #Find all numbers in row
    if numbers:
        return float(numbers[0])
    else:
        return np.nan

def fix_CADD_phred(df):
    df['CADD_phred'] = df['CADD_phred'].astype(str).apply(lambda x: extract_numbers(x))
    return df

for sample, df in dfs.items():
    dfs[sample] = fix_CADD_phred(df)

# Check the results
for sample, df in dfs.items():
    print(f"Processed CADD_phred for {sample}:")
    print(df['CADD_phred'].head())

Processed CADD_phred for POU538:
0       NaN
1     2.209
2    14.750
3       NaN
4       NaN
Name: CADD_phred, dtype: float64
Processed CADD_phred for QZU677:
0       NaN
1     2.209
2    14.750
3       NaN
4       NaN
Name: CADD_phred, dtype: float64
Processed CADD_phred for OQL728:
0       NaN
1     2.209
2    14.750
3       NaN
4       NaN
Name: CADD_phred, dtype: float64


The column gnomADv3_AF needs to be converted to a numerical form so that it can filter



In [70]:
def extract_numbers_to_null(row):
    numbers = re.findall(r'\d+\.?\d*(?:e-?\d+)?', row)  # Найти все числа в строке
    if numbers:
        return float(numbers[0])
    else:
        return 0

def fix_gnomADv3_AF(df):
    df['gnomADv3_AF'] = df['gnomADv3_AF'].astype(str).apply(lambda x: extract_numbers_to_null(x))
    return df

for sample, df in dfs.items():
    dfs[sample] = fix_gnomADv3_AF(df)

# Check the results
for sample, df in dfs.items():
    print(f"Processed gnomADv3_AF for {sample}:")
    print(df['gnomADv3_AF'].head())

Processed gnomADv3_AF for POU538:
0    0.629059
1    0.846001
2    0.060879
3    0.486394
4    0.665930
Name: gnomADv3_AF, dtype: float64
Processed gnomADv3_AF for QZU677:
0    0.629059
1    0.846001
2    0.060879
3    0.486394
4    0.665930
Name: gnomADv3_AF, dtype: float64
Processed gnomADv3_AF for OQL728:
0    0.629059
1    0.846001
2    0.060879
3    0.486394
4    0.665930
Name: gnomADv3_AF, dtype: float64


The letter values ​​corresponding to the tools for predicting pathogenic variants also need to be formatted.

In [71]:
def check_for_letters(value):
    if re.search(r'[a-zA-Z]', str(value)):
        return value
    else:
        return np.nan

columns = ['CLIN_SIG', 'DEOGEN2_pred', 'FATHMM_pred', 'LRT_pred', 'MetaSVM_pred', 'MutationTaster_pred', 'PROVEAN_pred', 'Polyphen2_HVAR_pred', 'PrimateAI_pred', 'SIFT_pred', 'CLINVAR_CLNSIG']


def fix_columns(df):
    df[columns] = df[columns].applymap(check_for_letters)
    return df

for sample, df in dfs.items():
    dfs[sample] = fix_columns(df)

# Check the results using the column FATHMM_pred as example
for sample, df in dfs.items():
    print(f"Processed FATHMM_pred for {sample}:")
    print(df['FATHMM_pred'].head())

  df[columns] = df[columns].applymap(check_for_letters)
  df[columns] = df[columns].applymap(check_for_letters)
  df[columns] = df[columns].applymap(check_for_letters)


Processed FATHMM_pred for POU538:
0    NaN
1    .&T
2    .&T
3    NaN
4    NaN
Name: FATHMM_pred, dtype: object
Processed FATHMM_pred for QZU677:
0    NaN
1    .&T
2    .&T
3    NaN
4    NaN
Name: FATHMM_pred, dtype: object
Processed FATHMM_pred for OQL728:
0    NaN
1    .&T
2    .&T
3    NaN
4    NaN
Name: FATHMM_pred, dtype: object


# Filtration

In [72]:
def gnomADv3_01_filtr(df):
    df = df.replace('.', np.nan)
    df.iloc[:, 8] = df.iloc[:, 8].astype(float)
    df = df[(df.iloc[:, 7] != np.NaN) & (df.iloc[:, 7] != '0/0') & (df['FILTER'] == 'PASS') & (df.iloc[:, 8] > 50) & (df['gnomADv3_AF'] <= 0.01)]
    return df

for sample, df in dfs.items():
    dfs[sample] = gnomADv3_01_filtr(df)

# Check the results
for sample, df in dfs.items():
    print(f"Filtered {sample} to a dataframe with the number of rows equal to:")
    print(len(df))

Filtered POU538 to a dataframe with the number of rows equal to:
2366
Filtered QZU677 to a dataframe with the number of rows equal to:
2605
Filtered OQL728 to a dataframe with the number of rows equal to:
2532


Further filtering based on the pathogenicity criterion obtained from different tools - a variant was included in the sample if more than half of the tools had a pathogenicity prediction

In [75]:
thresholds = {
    'CADD_phred': 20
}


columns = ['CADD_phred', 'CLIN_SIG', 'CLINVAR_CLNSIG', 'DEOGEN2_pred', 'FATHMM_pred',
           'LRT_pred', 'MetaSVM_pred', 'MutationTaster_pred', 'PROVEAN_pred',
           'Polyphen2_HVAR_pred', 'PrimateAI_pred', 'SIFT_pred']


def is_pathogenic(row):
    pathogenic_count = 0
    count = 0

    if not pd.isna(row['CADD_phred']):
        if row['CADD_phred'] >= thresholds['CADD_phred']:
            pathogenic_count += 1
        count += 1

    if not pd.isna(row['CLIN_SIG']):
        if 'pathogenic' in row['CLIN_SIG'] or 'likely_pathogenic' in row['CLIN_SIG']:
            pathogenic_count += 1
        count += 1

    if not pd.isna(row['CLINVAR_CLNSIG']):
        if 'Pathogenic' in row['CLINVAR_CLNSIG'] or 'Likely_pathogenic' in row['CLINVAR_CLNSIG']:
            pathogenic_count += 1
        count += 1

    for col in columns[3:]:
        if not pd.isna(row[col]):
            if 'D' in str(row[col]):
                pathogenic_count += 1
            count += 1

    return pathogenic_count > (count / 2)


for sample, df in dfs.items():
    df['pathogenic'] = df.apply(is_pathogenic, axis=1)
    dfs[sample] = df

# Check the results
for sample, df in dfs.items():
    print(f"Filtered DataFrame for {sample} has {len(df)} rows.")
    print(df['pathogenic'].head())

Filtered DataFrame for POU538 has 1824 rows.
404    False
479    False
761    False
865    False
908    False
Name: pathogenic, dtype: bool
Filtered DataFrame for QZU677 has 1936 rows.
294    False
457    False
458    False
463    False
601    False
Name: pathogenic, dtype: bool
Filtered DataFrame for OQL728 has 1900 rows.
142    False
347    False
431    False
550     True
602     True
Name: pathogenic, dtype: bool


In [76]:
for sample, df in dfs.items():
    dfs[sample] = df[df['pathogenic'] == True]

# Check the results
for sample, df in dfs.items():
    print(f"Filtered DataFrame for {sample} has {len(df)} rows.")
    print(df['pathogenic'].head())

Filtered DataFrame for POU538 has 68 rows.
15440    True
33272    True
42026    True
43562    True
44581    True
Name: pathogenic, dtype: bool
Filtered DataFrame for QZU677 has 92 rows.
5696     True
7287     True
8182     True
16607    True
19385    True
Name: pathogenic, dtype: bool
Filtered DataFrame for OQL728 has 88 rows.
550     True
602     True
672     True
8652    True
9684    True
Name: pathogenic, dtype: bool


In [77]:
for sample, df in dfs.items():
    df.to_csv(f'{sample}_pathogenic.csv', index=False)