In [None]:
### 

import pandas as pd
def combine_clinical_data_and_reference_data(reference, clinical):
    """
    Annotate the variants of a dataset with the disease listed at the same
    position in a clinical database.

    Parameters:
    reference : DataFrame with the variants of YOUR dataset. It must have a
                'POS' column. It is not modified.
    clinical : path of the CSV export of the online clinical database (anything
               accepted by pandas.read_csv). It must have 'Position' and
               'Disease' columns.

    Returns:
    A new DataFrame with every row and column of `reference`, in the same
    order, plus a 'Disease' column. 'Disease' is NaN where no clinical record
    shares the variant's position.
    """
    df_clinical = pd.read_csv(clinical)

    # Keep one clinical record per position (the first one listed). Without
    # this, a position that occurs several times would make the left merge
    # below emit several rows per variant, and the result would no longer line
    # up row-for-row with the input table.
    disease_by_position = df_clinical[['Position', 'Disease']].drop_duplicates(
        subset='Position', keep='first'
    )

    # The temporary join key lives on a copy, so the caller's frame is untouched.
    df_ref = reference.assign(Position=reference['POS'])

    df_merged = pd.merge(df_ref, disease_by_position, on='Position', how='left')
    return df_merged.drop(columns=['Position'])

def clean_headers_of_df(filepath, file_id):
    """
    Load a tab-separated VCF table and reduce the sample column to its AF value.

    Parameters:
    filepath : tab-separated VCF table whose first line holds the column names.
    file_id : name of the sample column in that table; it is also written into
              the new 'Sample' column.

    Returns:
    The table without the 'ID', 'QUAL', 'INFO', 'FORMAT' and sample columns,
    with a 'Sample' column and an 'AF' column added.
    """
    # Names given, in order, to the colon-separated pieces of the sample field.
    # NOTE(review): assumes every file uses this sub-field order - confirm
    # against the FORMAT column of the input.
    genotype_fields = ['GT', 'AD', 'AF', 'DP', 'F1R2', 'F2R1', 'FAD', 'SB']

    table = pd.read_csv(filepath, sep='\t')
    table[file_id] = table[file_id].str.strip()
    table["Sample"] = file_id

    # At most seven splits, i.e. up to eight pieces per row.
    table[genotype_fields] = table[file_id].str.split(':', n=7, expand=True)

    # Everything except AF is discarded again, together with the raw columns.
    unwanted = ['ID', 'QUAL', 'FORMAT', 'INFO', file_id]
    unwanted += [field for field in genotype_fields if field != 'AF']
    return table.drop(columns=unwanted)



def adding_variant_format(df, position):
    """
    Add a 'Mutation' column (REF + POS + ALT, e.g. 'A73G') at a given column index.

    Parameters:
    df : DataFrame with 'REF', 'POS' and 'ALT' columns; it is modified in place.
    position : integer column index at which 'Mutation' is inserted.

    Returns:
    The same `df` object.
    """
    pos_as_text = df['POS'].astype(str)
    variant_labels = df['REF'] + pos_as_text + df['ALT']

    # A 'Mutation' column left over from an earlier call is replaced, not duplicated.
    if 'Mutation' in df.columns:
        del df['Mutation']

    df.insert(position, 'Mutation', variant_labels)
    return df

import re
import numpy as np

def split_column(df, column_name, text_col_name, number_col_name):
    """
    Split a column of 'label(number)' strings into a text and a numeric column.

    A value such as 'deleterious(0.01)' is split into the label 'deleterious'
    and the number 0.01. A value that does not start with that pattern, or that
    is missing, is copied unchanged into the text column and gets NaN in the
    number column.

    Parameters:
    df : DataFrame to modify in place.
    column_name : name of the column to split; it is removed from `df`.
    text_col_name : name of the column that receives the label.
    number_col_name : name of the column that receives the number.

    Returns:
    The same `df` object, with the two new columns and without `column_name`.
    """
    pattern = re.compile(r'([a-zA-Z_]+)\((\d+\.?\d*)\)')

    def split_value(value):
        # Match once per value; fall back to (value, None) when it does not fit.
        if pd.notnull(value):
            match = pattern.match(str(value))
            if match:
                return match.groups()
        return value, None

    # Building both columns from plain lists also works for an empty table,
    # where expanding an empty result into two columns would raise.
    parts = [split_value(value) for value in df[column_name]]
    df[text_col_name] = pd.Series([text for text, _ in parts], index=df.index)
    df[number_col_name] = pd.to_numeric(
        pd.Series([number for _, number in parts], index=df.index, dtype=object),
        errors='coerce',
    )

    # Remove the original column
    df.drop(columns=[column_name], inplace=True)

    return df

## Data Analysis of GATK data: 
import glob
import os

def VCF_table_converter(folder_path, output,
                        vep_folder="Output/VEP_txt_files/lofreq_txt",
                        clinical_csv="MutationsCodingControl_MITOMAP_Foswiki.csv"):
    """
    Convert every '*_markdup_filtered.vcf' file of a folder into an annotated TSV.

    For each matching file the function:
      1. rewrites the VCF in place without its '##' lines,
      2. loads the table with clean_headers_of_df, using the file id (the file
         name without '_markdup_filtered.vcf') as the sample column name,
      3. adds the 'Mutation' column and the 'Disease' column from `clinical_csv`,
      4. appends the selected VEP columns side by side (row i next to row i),
      5. writes '<output>/<file_id>.tsv'.

    Parameters:
    folder_path : folder that is searched for '*_markdup_filtered.vcf' files.
    output : folder receiving the TSV files; it is created when missing.
    vep_folder : folder holding one '<file_id>.txt' VEP table per VCF file.
    clinical_csv : CSV with the clinical data ('Position' and 'Disease' columns).

    Returns:
    None. The results are written to disk.
    """
    import warnings

    # NOTE(review): the default `vep_folder` is the path that was hard-coded
    # here before and points at a LoFreq folder - confirm it is the intended
    # source for GATK calls.
    vep_columns = [
        'IMPACT', 'Consequence', 'SYMBOL', 'Gene', 'Feature', 'BIOTYPE',
        'Existing_variation', 'cDNA_position', 'CDS_position', 'Protein_position',
        'Amino_acids', 'Codons', 'SIFT', 'PolyPhen', 'CLIN_SIG',
    ]

    for filepath in glob.glob(os.path.join(folder_path, "*_markdup_filtered.vcf")):  # Adjust the pattern if needed
        # NOTE: this overwrites the input file; its '##' lines are lost.
        with open(filepath, 'r') as file:
            lines = file.readlines()
        with open(filepath, 'w') as wfile:
            wfile.writelines(line for line in lines if not line.startswith("##"))

        filename = os.path.basename(filepath)
        file_id = filename.split('_markdup_filtered.vcf')[0]
        print(file_id)

        # clean_headers_of_df needs the sample column name as second argument.
        df_vcf = clean_headers_of_df(filepath, file_id)
        df_vcf = adding_variant_format(df_vcf, 1)

        # Combining the Clinical data with the reference
        df_vcf = combine_clinical_data_and_reference_data(df_vcf, clinical_csv)

        VEP_df = pd.read_csv(f"{vep_folder}/{file_id}.txt", sep='\t', header=0)
        # Explicit copy: split_column adds and drops columns on this frame.
        VEP_df = VEP_df[vep_columns].copy()
        VEP_df = split_column(VEP_df, 'SIFT', 'SIFT_', 'SIFT_value')
        VEP_df = split_column(VEP_df, 'PolyPhen', 'PolyPhen_', 'PolyPhen_value')
        VEP_df = VEP_df.reset_index(drop=True)
        df_vcf = df_vcf.reset_index(drop=True)

        df_vcf = df_vcf.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
        VEP_df = VEP_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

        # The two tables are glued together purely by row number, so a
        # different row count means annotations sit next to the wrong variant.
        if len(df_vcf) != len(VEP_df):
            warnings.warn(
                f"{file_id}: VCF table has {len(df_vcf)} rows but VEP table has "
                f"{len(VEP_df)}; rows may be misaligned"
            )
        df_vcf = pd.concat([df_vcf, VEP_df], axis=1)
        print(df_vcf)

        # print out all the tsv files as separate files in specified folders
        if output:
            os.makedirs(output, exist_ok=True)
        df_vcf.to_csv(f'{output}/{file_id}.tsv', sep='\t', index=False)

In [None]:
def clean_headers_of_df_lofreq(filepath):
    """
    Load a tab-separated VCF table and replace its INFO column by the AF entry.

    Parameters:
    filepath : tab-separated VCF table whose first line holds the column names.
               It must have 'ID' and 'INFO' columns.

    Returns:
    The table without the 'ID' and 'INFO' columns and with an 'AF' column
    added. 'AF' holds the 'AF=<value>' entry of INFO as text, or NaN when INFO
    has no such entry.
    """
    df_vcf = pd.read_csv(filepath, sep='\t')
    info = df_vcf['INFO'].str.strip()

    # Pick the AF entry by its key, not by its position among the
    # ';'-separated entries. Rows with extra or reordered INFO entries then
    # neither raise nor put another field into 'AF'.
    # NOTE(review): the 'AF=' prefix is kept because the previous version
    # produced it; strip it here if downstream code expects a bare number.
    df_vcf['AF'] = info.str.extract(r'(?:^|;)(AF=[^;]*)', expand=False)

    return df_vcf.drop(['ID', 'INFO'], axis=1)


## Data Analysis of LoFreq data:
import glob
import os

def VCF_table_converter_lofreq(folder_path, output,
                               vep_folder="Output/VEP_txt_files/lofreq_txt",
                               clinical_csv="MutationsCodingControl_MITOMAP_Foswiki.csv"):
    """
    Convert every '*_markdup_filtered.vcf' file of a folder into an annotated TSV.

    For each matching file the function:
      1. rewrites the VCF in place without its '##' lines,
      2. loads the table with clean_headers_of_df_lofreq,
      3. adds the 'Mutation' column and the 'Disease' column from `clinical_csv`,
      4. appends the selected VEP columns side by side (row i next to row i),
      5. writes '<output>/<file_id>.tsv', where the file id is the file name
         without '_markdup_filtered.vcf'.

    Parameters:
    folder_path : folder that is searched for '*_markdup_filtered.vcf' files.
    output : folder receiving the TSV files; it is created when missing.
    vep_folder : folder holding one '<file_id>.txt' VEP table per VCF file.
    clinical_csv : CSV with the clinical data ('Position' and 'Disease' columns).

    Returns:
    None. The results are written to disk.
    """
    import warnings

    vep_columns = [
        'IMPACT', 'Consequence', 'SYMBOL', 'Gene', 'Feature', 'BIOTYPE',
        'Existing_variation', 'cDNA_position', 'CDS_position', 'Protein_position',
        'Amino_acids', 'Codons', 'SIFT', 'PolyPhen', 'CLIN_SIG',
    ]

    for filepath in glob.glob(os.path.join(folder_path, "*_markdup_filtered.vcf")):  # Adjust the pattern if needed
        # NOTE: this overwrites the input file; its '##' lines are lost.
        with open(filepath, 'r') as file:
            lines = file.readlines()
        with open(filepath, 'w') as wfile:
            wfile.writelines(line for line in lines if not line.startswith("##"))

        filename = os.path.basename(filepath)
        file_id = filename.split('_markdup_filtered.vcf')[0]
        print(file_id)

        df_vcf = clean_headers_of_df_lofreq(filepath)
        df_vcf = adding_variant_format(df_vcf, 1)

        # Combining the Clinical data with the reference
        df_vcf = combine_clinical_data_and_reference_data(df_vcf, clinical_csv)

        VEP_df = pd.read_csv(f"{vep_folder}/{file_id}.txt", sep='\t', header=0)
        # Explicit copy: split_column adds and drops columns on this frame.
        VEP_df = VEP_df[vep_columns].copy()
        VEP_df = split_column(VEP_df, 'SIFT', 'SIFT_', 'SIFT_value')
        VEP_df = split_column(VEP_df, 'PolyPhen', 'PolyPhen_', 'PolyPhen_value')
        VEP_df = VEP_df.reset_index(drop=True)
        df_vcf = df_vcf.reset_index(drop=True)

        df_vcf = df_vcf.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
        VEP_df = VEP_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

        # The two tables are glued together purely by row number, so a
        # different row count means annotations sit next to the wrong variant.
        if len(df_vcf) != len(VEP_df):
            warnings.warn(
                f"{file_id}: VCF table has {len(df_vcf)} rows but VEP table has "
                f"{len(VEP_df)}; rows may be misaligned"
            )
        df_vcf = pd.concat([df_vcf, VEP_df], axis=1)

        # print out all the tsv files as separate files in specified folders
        if output:
            os.makedirs(output, exist_ok=True)
        df_vcf.to_csv(f'{output}/{file_id}.tsv', sep='\t', index=False)