In [1]:
import os
# Move the working directory up one level so that the relative paths used below
# (e.g. "config.yaml") are resolved from the parent directory.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time moves up one more level; restart the kernel before re-running.
os.chdir("../")
# Show the resulting working directory as a sanity check.
%pwd

'E:\\Masters_thesis'

In [9]:
import pandas as pd
import yaml

In [12]:
yaml_file_path = "config.yaml"

# Parse the YAML configuration into `config_data`, which the later cells read.
with open(yaml_file_path, "r") as config_file:
    config_data = yaml.safe_load(config_file)

# Sanity check: echo a few entries to confirm the file was parsed as expected.
for section, option in (
    ('nsp1_config', 'LENGTH'),
    ('nsp1_config', 'PATH'),
    ('ROOT', 'DATA_FOLDER'),
    ('ROOT', 'DATA_DIR'),
):
    print(config_data[section][option])

540
NSP1_ger_all_nuc_seqs.csv
data
raw data


# Clean the raw data in the following way:

1. For each individual protein, keep sequences of its particular length and discard all the other variant length sequences
2. Look for sequencing errors, such as the presence of N, D, or any other character besides A, T, G, and C
3. Merge the sequences using meta data

# Step 1: 
    
##    - Iterate over each individual protein.
##    - For each protein, keep only the sequences with a particular length.
##    - Discard all variant length sequences associated with the protein.
##    - Repeat this process for each protein in your dataset.

In [56]:
### For every protein in the config: keep only the rows whose sequence length matches the
### configured LENGTH, report how many rows were discarded, and save the kept rows to a new CSV.

DATA_FOLDER = config_data['ROOT']['DATA_FOLDER']
DATA_DIR = config_data['ROOT']['DATA_DIR']
NEW_DIR = "step1_filtered"

# Every protein is written to the same folder, so create it once before the loop.
os.makedirs(os.path.join(DATA_FOLDER, NEW_DIR), exist_ok=True)

for protein_key, protein_config in config_data.items():

    # "ROOT" holds the folder settings, not a protein entry.
    if protein_key == "ROOT":
        continue

    protein_name = protein_key.replace('_config', '')
    protein_csv_path = protein_config['PATH']
    protein_length = protein_config['LENGTH']

    df = pd.read_csv(os.path.join(DATA_FOLDER, DATA_DIR, protein_csv_path))

    # Vectorised length check. Missing sequences are rejected explicitly instead of
    # being stringified to "nan" and measured as a 3-character sequence.
    sequences = df['Sequence']
    length_condition = sequences.notna() & (sequences.astype(str).str.len() == protein_length)

    filtered_df = df[length_condition]

    print(f"NSP: {protein_name}")
    print(f"------ Original number of rows: {len(df)}")
    print(f"------ After Preprocessing: {len(filtered_df)}")
    print(f"------ Length of each sequence: {protein_length}")
    print(f"------ Total rows discarded: {len(df) - len(filtered_df)}")

    # Save the filtered DataFrame to a new CSV file
    output_csv_path = os.path.join(DATA_FOLDER, NEW_DIR, f"{protein_name}_step1_filtered.csv")
    filtered_df.to_csv(output_csv_path, index=False)
    print(f"------ Filtered data saved to: {output_csv_path}")

    print("*"*100)

NSP: nsp1
------ Original number of rows: 490428
------ After Preprocessing: 477904
------ Length of each sequence: 540
------ Total rows discarded: 12524
------ Filtered data saved to: data\step1_filtered\nsp1_step1_filtered.csv
****************************************************************************************************
NSP: nsp2
------ Original number of rows: 490465
------ After Preprocessing: 490152
------ Length of each sequence: 1914
------ Total rows discarded: 313
------ Filtered data saved to: data\step1_filtered\nsp2_step1_filtered.csv
****************************************************************************************************
NSP: nsp3
------ Original number of rows: 490465
------ After Preprocessing: 392462
------ Length of each sequence: 5835
------ Total rows discarded: 98003
------ Filtered data saved to: data\step1_filtered\nsp3_step1_filtered.csv
****************************************************************************************************
NSP: ns

NSP: ns9b
------ Original number of rows: 490464
------ After Preprocessing: 352662
------ Length of each sequence: 294
------ Total rows discarded: 137802
------ Filtered data saved to: data\step1_filtered\ns9b_step1_filtered.csv
****************************************************************************************************
NSP: ns9c
------ Original number of rows: 490461
------ After Preprocessing: 489639
------ Length of each sequence: 222
------ Total rows discarded: 822
------ Filtered data saved to: data\step1_filtered\ns9c_step1_filtered.csv
****************************************************************************************************


# Step 2:
##  - Check for sequencing errors (N, D, or any other character besides A, T, G, and C) in each sequence.

In [96]:
# Step 2 reads the files written by step 1 and saves its results to a separate folder,
# both located under the data folder named in the config.
STEP1_DIR, NEW_DIR = "step1_filtered", "step2_filtered"
DATA_FOLDER = config_data['ROOT']['DATA_FOLDER']

In [97]:
# Characters accepted in a clean nucleotide sequence; any other character marks
# the row as a sequencing error.
valid_alphabets = set('ATGC')

# Every protein is written to the same folder, so create it once before the loop.
os.makedirs(os.path.join(DATA_FOLDER, NEW_DIR), exist_ok=True)

for protein_key, protein_config in config_data.items():

    # "ROOT" holds the folder settings, not a protein entry.
    if protein_key == "ROOT":
        continue

    protein_name = protein_key.replace('_config', '')
    protein_csv_path_step1 = os.path.join(DATA_FOLDER, STEP1_DIR, f"{protein_name}_step1_filtered.csv")

    if not os.path.exists(protein_csv_path_step1):
        print(f"Skipping {protein_name} as step1 filtered file doesn't exist.")
        continue

    df_step1 = pd.read_csv(protein_csv_path_step1)

    # Check for sequencing errors: keep a row only if every character of its sequence
    # is in valid_alphabets. issuperset() walks the string directly, so no per-row set
    # is built; astype(bool) keeps the mask boolean even when the file has no rows.
    alphabet_condition = (
        df_step1['Sequence']
        .map(lambda seq: valid_alphabets.issuperset(str(seq)))
        .astype(bool)
    )
    final_filtered_df = df_step1[alphabet_condition]

    print(f"NSP: {protein_name}")
    print(f"------ Original number of rows: {len(df_step1)}")
    print(f"------ After Sequencing Error Filtering: {len(final_filtered_df)}")
    print(f"------ Total rows discarded due to sequencing errors: {len(df_step1) - len(final_filtered_df)}")

    # Save the final filtered DataFrame to a new CSV file in step2_filtered folder
    output_csv_path = os.path.join(DATA_FOLDER, NEW_DIR, f"{protein_name}_step2_filtered.csv")
    final_filtered_df.to_csv(output_csv_path, index=False)
    print(f"------ Final Filtered data saved to: {output_csv_path}")

    print("*"*100)

NSP: nsp1
------ Original number of rows: 477904
------ After Sequencing Error Filtering: 461110
------ Total rows discarded due to sequencing errors: 16794
------ Final Filtered data saved to: data\step2_filtered\nsp1_step2_filtered.csv
****************************************************************************************************
NSP: nsp2
------ Original number of rows: 490152
------ After Sequencing Error Filtering: 443016
------ Total rows discarded due to sequencing errors: 47136
------ Final Filtered data saved to: data\step2_filtered\nsp2_step2_filtered.csv
****************************************************************************************************
NSP: nsp3
------ Original number of rows: 392462
------ After Sequencing Error Filtering: 272730
------ Total rows discarded due to sequencing errors: 119732
------ Final Filtered data saved to: data\step2_filtered\nsp3_step2_filtered.csv
***********************************************************************************

------ Final Filtered data saved to: data\step2_filtered\n_step2_filtered.csv
****************************************************************************************************
NSP: ns9b
------ Original number of rows: 352662
------ After Sequencing Error Filtering: 309166
------ Total rows discarded due to sequencing errors: 43496
------ Final Filtered data saved to: data\step2_filtered\ns9b_step2_filtered.csv
****************************************************************************************************
NSP: ns9c
------ Original number of rows: 489639
------ After Sequencing Error Filtering: 471552
------ Total rows discarded due to sequencing errors: 18087
------ Final Filtered data saved to: data\step2_filtered\ns9c_step2_filtered.csv
****************************************************************************************************
