In [3]:
pip install pandas numpy scikit-learn matplotlib scipy

Collecting pandas
  Downloading pandas-2.2.2-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp39-cp39-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB 495.5 kB/s eta 0:00:24
   ---------------------------------------- 0.1/11.6 MB 660.6 kB/s eta 0:00:18
   ---------------------------------------- 0.1/11.6 MB 660.6 kB/s eta 0:00:18
   ---------------------------------------- 0.1/11.6 MB 660.6 kB/s eta 0:00:18
   ---------------------------------------- 0.1/11.6 MB 660.6 kB/s eta 0:00:18
   ---------------------------------------- 0.1/11.6 MB 660.6 kB/s eta 0:00:18
   ---------------------------------------- 0.1/11.6 MB 426.7 kB/s eta 0:00:27
    --------------------------------------- 0.2/11.6 MB

In [2]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

normal_csv = r"C:\Users\Badis\Desktop\normaldata.csv"
pathogenic_csv = r"C:\Users\Badis\Desktop\pathodatas.csv"
output_folder = 'FCGR_Analysis_k4'

splits = ['train', 'validation', 'test']
classes = ['normal', 'pathogenic']
for split in splits:
    for cls in classes:
        os.makedirs(os.path.join(output_folder, split, cls), exist_ok=True)

normal_df = pd.read_csv(normal_csv, dtype=str)
pathogenic_df = pd.read_csv(pathogenic_csv, dtype=str)

normal_df['segment'] = normal_df['segment'].fillna('').astype(str)
pathogenic_df['Mutated_seg'] = pathogenic_df['Mutated_seg'].fillna('').astype(str)

def extract_kmer_frequencies(sequence, k=4):
    if isinstance(sequence, str):
        kmer_counts = Counter([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
        return {kmer: count for kmer, count in kmer_counts.items() if set(kmer).issubset({'A', 'C', 'G', 'T'})}
    else:
        return Counter()

normal_df['kmer_frequencies'] = normal_df['segment'].apply(lambda seq: extract_kmer_frequencies(seq, k=4))
pathogenic_df['kmer_frequencies'] = pathogenic_df['Mutated_seg'].apply(lambda seq: extract_kmer_frequencies(seq, k=4))

nucleotide_to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def kmer_to_fcgr(frequencies, k=4):
    size = 2**k
    fcgr = np.zeros((size, size))
    
    for kmer, freq in frequencies.items():
        index = sum([nucleotide_to_int[char] * (4**i) for i, char in enumerate(reversed(kmer))])
        x, y = divmod(index, size)
        fcgr[x, y] = freq
    
    return fcgr

def save_fcgr_images(df, folder, prefix, k=4):
    for i, freqs in enumerate(df['kmer_frequencies']):
        fcgr = kmer_to_fcgr(freqs, k=k)
        
        plt.figure(figsize=(6, 6))
        plt.imshow(fcgr, cmap='viridis')
        plt.axis('off')
        plt.savefig(os.path.join(output_folder, folder, f'{prefix}_{i+1}.png'), bbox_inches='tight', pad_inches=0)
        plt.close()

def split_data(df, split_ratios=(0.7, 0.2, 0.1), random_state=42):
    train_ratio, val_ratio, test_ratio = split_ratios
    train_df, temp_df = train_test_split(df, test_size=val_ratio + test_ratio, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (val_ratio + test_ratio), random_state=random_state)
    return train_df, val_df, test_df

normal_train_df, normal_val_df, normal_test_df = split_data(normal_df)

pathogenic_train_df, pathogenic_val_df, pathogenic_test_df = split_data(pathogenic_df)

save_fcgr_images(normal_train_df, 'train/normal', 'normal_train', k=4)
save_fcgr_images(pathogenic_train_df, 'train/pathogenic', 'pathogenic_train', k=4)

save_fcgr_images(normal_val_df, 'validation/normal', 'normal_validation', k=4)
save_fcgr_images(pathogenic_val_df, 'validation/pathogenic', 'pathogenic_validation', k=4)

save_fcgr_images(normal_test_df, 'test/normal', 'normal_test', k=4)
save_fcgr_images(pathogenic_test_df, 'test/pathogenic', 'pathogenic_test', k=4)

print("Analysis complete. Check the 'FCGR_Analysis_k4' folder for results.")


Analysis complete. Check the 'FCGR_Analysis_k4' folder for results.


In [3]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


normal_csv = r"C:\Users\Badis\Desktop\normaldata.csv"
pathogenic_csv = r"C:\Users\Badis\Desktop\pathodatas.csv"
output_folder = 'FCGR_Analysis_k5'


splits = ['train', 'validation', 'test']
classes = ['normal', 'pathogenic']
for split in splits:
    for cls in classes:
        os.makedirs(os.path.join(output_folder, split, cls), exist_ok=True)


normal_df = pd.read_csv(normal_csv, dtype=str)
pathogenic_df = pd.read_csv(pathogenic_csv, dtype=str)


normal_df['segment'] = normal_df['segment'].fillna('').astype(str)
pathogenic_df['Mutated_seg'] = pathogenic_df['Mutated_seg'].fillna('').astype(str)

# Extract k-mer frequencies
def extract_kmer_frequencies(sequence, k=5):
    if isinstance(sequence, str):
        kmer_counts = Counter([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
        return {kmer: count for kmer, count in kmer_counts.items() if set(kmer).issubset({'A', 'C', 'G', 'T'})}
    else:
        return Counter()

normal_df['kmer_frequencies'] = normal_df['segment'].apply(lambda seq: extract_kmer_frequencies(seq, k=5))
pathogenic_df['kmer_frequencies'] = pathogenic_df['Mutated_seg'].apply(lambda seq: extract_kmer_frequencies(seq, k=5))

nucleotide_to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def kmer_to_fcgr(frequencies, k=5):
    size = 2**k
    fcgr = np.zeros((size, size))
    
    for kmer, freq in frequencies.items():
        index = sum([nucleotide_to_int[char] * (4**i) for i, char in enumerate(reversed(kmer))])
        x, y = divmod(index, size)
        fcgr[x, y] = freq
    
    return fcgr

def save_fcgr_images(df, folder, prefix, k=5):
    for i, freqs in enumerate(df['kmer_frequencies']):
        fcgr = kmer_to_fcgr(freqs, k=k)
        
        plt.figure(figsize=(6, 6))
        plt.imshow(fcgr, cmap='viridis')
        plt.axis('off')
        plt.savefig(os.path.join(output_folder, folder, f'{prefix}_{i+1}.png'), bbox_inches='tight', pad_inches=0)
        plt.close()

def split_data(df, split_ratios=(0.7, 0.2, 0.1), random_state=42):
    train_ratio, val_ratio, test_ratio = split_ratios
    train_df, temp_df = train_test_split(df, test_size=val_ratio + test_ratio, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (val_ratio + test_ratio), random_state=random_state)
    return train_df, val_df, test_df

normal_train_df, normal_val_df, normal_test_df = split_data(normal_df)
pathogenic_train_df, pathogenic_val_df, pathogenic_test_df = split_data(pathogenic_df)
save_fcgr_images(normal_train_df, 'train/normal', 'normal_train', k=5)
save_fcgr_images(pathogenic_train_df, 'train/pathogenic', 'pathogenic_train', k=5)

save_fcgr_images(normal_val_df, 'validation/normal', 'normal_validation', k=5)
save_fcgr_images(pathogenic_val_df, 'validation/pathogenic', 'pathogenic_validation', k=5)

save_fcgr_images(normal_test_df, 'test/normal', 'normal_test', k=5)
save_fcgr_images(pathogenic_test_df, 'test/pathogenic', 'pathogenic_test', k=5)

print("Analysis complete for k=5. Check the 'FCGR_Analysis_k5' folder for results.")


Analysis complete for k=5. Check the 'FCGR_Analysis_k5' folder for results.


In [4]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

normal_csv = r"C:\Users\Badis\Desktop\normaldata.csv"
pathogenic_csv = r"C:\Users\Badis\Desktop\pathodatas.csv"
output_folder = 'FCGR_Analysis_k6'

splits = ['train', 'validation', 'test']
classes = ['normal', 'pathogenic']
for split in splits:
    for cls in classes:
        os.makedirs(os.path.join(output_folder, split, cls), exist_ok=True)

normal_df = pd.read_csv(normal_csv, dtype=str)
pathogenic_df = pd.read_csv(pathogenic_csv, dtype=str)

normal_df['segment'] = normal_df['segment'].fillna('').astype(str)
pathogenic_df['Mutated_seg'] = pathogenic_df['Mutated_seg'].fillna('').astype(str)

def extract_kmer_frequencies(sequence, k=6):
    if isinstance(sequence, str):
        kmer_counts = Counter([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
        return {kmer: count for kmer, count in kmer_counts.items() if set(kmer).issubset({'A', 'C', 'G', 'T'})}
    else:
        return Counter()

normal_df['kmer_frequencies'] = normal_df['segment'].apply(lambda seq: extract_kmer_frequencies(seq, k=6))
pathogenic_df['kmer_frequencies'] = pathogenic_df['Mutated_seg'].apply(lambda seq: extract_kmer_frequencies(seq, k=6))

nucleotide_to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def kmer_to_fcgr(frequencies, k=6):
    size = 2**k
    fcgr = np.zeros((size, size))
    
    for kmer, freq in frequencies.items():
        index = sum([nucleotide_to_int[char] * (4**i) for i, char in enumerate(reversed(kmer))])
        x, y = divmod(index, size)
        fcgr[x, y] = freq
    
    return fcgr

def save_fcgr_images(df, folder, prefix, k=6):
    for i, freqs in enumerate(df['kmer_frequencies']):
        fcgr = kmer_to_fcgr(freqs, k=k)
        
        plt.figure(figsize=(6, 6))
        plt.imshow(fcgr, cmap='viridis')
        plt.axis('off')
        plt.savefig(os.path.join(output_folder, folder, f'{prefix}_{i+1}.png'), bbox_inches='tight', pad_inches=0)
        plt.close()

def split_data(df, split_ratios=(0.7, 0.2, 0.1), random_state=42):
    train_ratio, val_ratio, test_ratio = split_ratios
    train_df, temp_df = train_test_split(df, test_size=val_ratio + test_ratio, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (val_ratio + test_ratio), random_state=random_state)
    return train_df, val_df, test_df

normal_train_df, normal_val_df, normal_test_df = split_data(normal_df)

pathogenic_train_df, pathogenic_val_df, pathogenic_test_df = split_data(pathogenic_df)

save_fcgr_images(normal_train_df, 'train/normal', 'normal_train', k=6)
save_fcgr_images(pathogenic_train_df, 'train/pathogenic', 'pathogenic_train', k=6)

save_fcgr_images(normal_val_df, 'validation/normal', 'normal_validation', k=6)
save_fcgr_images(pathogenic_val_df, 'validation/pathogenic', 'pathogenic_validation', k=6)

save_fcgr_images(normal_test_df, 'test/normal', 'normal_test', k=6)
save_fcgr_images(pathogenic_test_df, 'test/pathogenic', 'pathogenic_test', k=6)

print("Analysis complete for k=6. Check the 'FCGR_Analysis_k6' folder for results.")


Analysis complete for k=6. Check the 'FCGR_Analysis_k6' folder for results.
