In [1]:
import pandas as pd
import numpy as np


import os

# Load the raw matrix; missing entries are treated as 0 (fillna is a no-op when
# nothing is missing, so no prior null check is needed).
df = pd.read_csv('input.csv', sep=',').fillna(0)

if 'IDENTIFIER' in df.columns:
    df = df.set_index('IDENTIFIER')

    # Natural log of strictly positive entries; zeros and negatives pass through
    # unchanged. Vectorized replacement for the deprecated element-wise applymap:
    # non-positive cells are blanked to NaN before np.log so no invalid-value
    # warning is raised, then mask() writes the logs back only where value > 0.
    positive = df > 0
    df_log = df.mask(positive, np.log(df.where(positive)))

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('./process document', exist_ok=True)
    df_log.to_csv('./process document/log_data.csv')
else:
    print("Column 'IDENTIFIER' not found in the dataframe.")





  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load the log-transformed matrix written by the previous step.
df_log = pd.read_csv('./process document/log_data.csv', index_col='IDENTIFIER')

# Global extremes across every column; the printed value is the overall range.
min_val = df_log.min().min()
max_val = df_log.max().max()
print(max_val - min_val)

# Sanity check: rows holding any value outside [min_val, max_val].
# The bounds are the frame's own extremes, so both selections are expected to be empty.
below_mask = (df_log < min_val).any(axis=1)
above_mask = (df_log > max_val).any(axis=1)
out_of_range_low = df_log.loc[below_mask]
out_of_range_high = df_log.loc[above_mask]
print("Data points below min_val:\n", out_of_range_low)
print("Data points above max_val:\n", out_of_range_high)

# Equal-width binning: n intervals need n + 1 edges and are labelled 1..n.
n = 50
labels = range(1, n + 1)
interval_width = (max_val - min_val) / n
bins = []
for edge_idx in range(n + 1):
    bins.append(min_val + edge_idx * interval_width)

# Absolute tolerance added to each bin edge when comparing floating-point values.
epsilon = 1e-9

def discretize_value_v2(value, bin_edges=None, bin_labels=None, tol=None):
    """Map a single value to the label of the first bin that contains it.

    Parameters
    ----------
    value : scalar
        Value to discretize. An exact 0 is returned unchanged (it is not binned).
    bin_edges : sequence of float, optional
        Ascending bin edges; defaults to the module-level ``bins``.
    bin_labels : sequence, optional
        One label per interval; defaults to the module-level ``labels``.
    tol : float, optional
        Absolute tolerance applied to both edges of every interval; defaults to
        the module-level ``epsilon``.

    Returns
    -------
    The label of the first interval ``[edge[i] - tol, edge[i + 1] + tol]`` that
    contains ``value`` (a value sitting on a shared edge therefore gets the lower
    bin), ``value`` itself when it equals 0, or ``np.nan`` when no interval matches.
    """
    edges = bins if bin_edges is None else bin_edges
    names = labels if bin_labels is None else bin_labels
    slack = epsilon if tol is None else tol

    if value == 0:
        return value

    # The interval count comes from the edges rather than the global `n`:
    # `n` is rebound to a float threshold later in this notebook, which would
    # break range(n) if this function were re-run afterwards.
    for idx in range(len(edges) - 1):
        if edges[idx] - slack <= value <= edges[idx + 1] + slack:
            return names[idx]
    return np.nan  # no interval matched (also the result for NaN input)

# Start from an empty frame sharing the input's row index; columns are filled one at a time.
discretized_df = pd.DataFrame(index=df_log.index)

column_progress = tqdm(df_log.columns, desc="Processing columns")
for col in column_progress:
    # Element-wise binning of one column.
    binned_column = df_log[col].map(discretize_value_v2)
    discretized_df[col] = binned_column

# Human-readable legend: edges of every interval and the label assigned to it.
intervals = []
for i in range(n):
    intervals.append(f"From {bins[i]:.2f} to {bins[i + 1]:.2f} -> {labels[i]}")
print("Discretization intervals:", intervals)

# NaN marks a value that matched no interval; report how many there are.
nan_values = discretized_df.isna().sum().sum()
print("NaN count after discretization:", nan_values)

# Persist the discretized matrix for the next step.
discretized_df.to_csv('./process document/discretized_data.csv')

Processing columns: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1006.42it/s]

71.45748452703425
Data points below min_val:
 Empty DataFrame
Columns: [A, A.1, A.2, A.3, A.4, A.5, A.6, A.7, A.8, A.9, A.10, A.11, A.12, A.13, A.14, A.15, A.16, A.17, A.18, A.19, A.20, A.21, A.22, A.23, A.24, A.25, A.26, A.27, A.28, A.29, A.30, A.31, A.32, A.33, A.34, A.35, A.36, A.37, A.38, A.39, A.40, A.41, A.42, A.43, A.44, A.45, A.46, A.47, A.48, A.49, B, B.1, B.2, B.3, B.4, B.5, B.6, B.7, B.8, B.9, B.10, B.11, B.12, B.13, B.14, B.15, B.16, B.17, B.18, B.19, B.20, B.21, B.22, B.23, B.24, B.25, B.26, B.27, B.28, B.29, B.30, B.31, B.32, B.33, B.34, B.35, B.36, B.37, B.38, B.39, B.40, B.41, B.42, B.43, B.44, B.45, B.46, B.47, B.48, B.49]
Index: []

[0 rows x 100 columns]
Data points above max_val:
 Empty DataFrame
Columns: [A, A.1, A.2, A.3, A.4, A.5, A.6, A.7, A.8, A.9, A.10, A.11, A.12, A.13, A.14, A.15, A.16, A.17, A.18, A.19, A.20, A.21, A.22, A.23, A.24, A.25, A.26, A.27, A.28, A.29, A.30, A.31, A.32, A.33, A.34, A.35, A.36, A.37, A.38, A.39, A.40, A.41, A.42, A.43, A.44, A.45, 




In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


# Discretized matrix from the previous step, indexed by the IDENTIFIER column.
df_discretized = pd.read_csv(
    './process document/discretized_data.csv',
    index_col='IDENTIFIER',
)

# Tab-separated edge list; each row supplies PARTICIPANT_A / PARTICIPANT_B.
df_pairs = pd.read_csv(
    'input.edges.txt',
    sep='\t',
)


def _shannon_entropy(values):
    """Return the Shannon entropy (natural log) of the empirical distribution of ``values``."""
    _, counts = np.unique(values, return_counts=True)
    probs = counts / len(values)
    return -np.sum(probs * np.log(probs))


def calculate_metrics(pair, discretized=None):
    """Compute mutual information, entropies and symmetric uncertainty for one pair.

    Parameters
    ----------
    pair : mapping
        Must provide the keys ``'PARTICIPANT_A'`` and ``'PARTICIPANT_B'``.
    discretized : pandas.DataFrame, optional
        Table whose index holds the identifiers and whose rows hold the
        discretized values; defaults to the module-level ``df_discretized``.

    Returns
    -------
    tuple
        ``(gene_a, gene_b, MI, H_A, H_B, SU)`` where ``SU = 2 * MI / (H_A + H_B)``.
        All four metrics are NaN when either identifier is missing from the
        table; ``SU`` alone is NaN when both entropies are zero (two constant rows).
    """
    table = df_discretized if discretized is None else discretized
    gene_a = pair['PARTICIPANT_A']
    gene_b = pair['PARTICIPANT_B']

    # Nothing can be computed unless both identifiers are present.
    if gene_a not in table.index or gene_b not in table.index:
        return gene_a, gene_b, np.nan, np.nan, np.nan, np.nan

    data_a = table.loc[gene_a]
    data_b = table.loc[gene_b]

    MI = mutual_info_score(data_a, data_b)
    H_A = _shannon_entropy(data_a)
    H_B = _shannon_entropy(data_b)

    # Guard the 0 / 0 case explicitly: the result is still NaN, but without
    # triggering a floating-point RuntimeWarning.
    total_entropy = H_A + H_B
    SU = 2.0 * MI / total_entropy if total_entropy > 0 else np.nan

    return gene_a, gene_b, MI, H_A, H_B, SU


# Score every edge on a pool of 12 worker threads, with a progress bar over the results.
with ThreadPoolExecutor(max_workers=12) as executor:
    pair_records = df_pairs.to_dict('records')
    metric_iter = executor.map(calculate_metrics, pair_records)
    results = list(tqdm(metric_iter, total=len(df_pairs), desc="Calculating metrics"))

# One row per input pair, columns in the order calculate_metrics returns them.
result_columns = ['PARTICIPANT_A', 'PARTICIPANT_B', 'MI', 'H_A', 'H_B', 'SU']
df_results = pd.DataFrame(results, columns=result_columns)

df_results.to_csv('./process document/gene_pairs_analysis.csv', index=False)

Calculating metrics: 100%|██████████████████████████████████████████████████████████| 195/195 [00:00<00:00, 445.55it/s]


In [4]:
import pandas as pd


df = pd.read_csv('./process document/gene_pairs_analysis.csv')

# Minimum SU score a pair must reach to be kept.
N = 0.1

# Rank pairs from highest to lowest SU, then keep those at or above the cutoff
# (rows whose SU is NaN fail the comparison and are dropped).
df_sorted = df.sort_values('SU', ascending=False)
passes_cutoff = df_sorted['SU'] >= N
df_filtered = df_sorted.loc[passes_cutoff]

df_filtered.to_csv('./process document/filtered_gene_pairs_analysis.csv', index=False)


In [5]:
import pandas as pd


df = pd.read_csv('./process document/filtered_gene_pairs_analysis.csv')

# Stack both endpoint columns so every edge contributes one entry per participant,
# then count how often each gene appears.
all_genes = pd.concat((df['PARTICIPANT_A'], df['PARTICIPANT_B']))
edges_count = all_genes.value_counts()

# Two-column table: gene identifier and the number of edges it takes part in.
df_edges = pd.DataFrame({
    'Gene': edges_count.index,
    'Edges': edges_count.to_numpy(),
})

# Highest-degree genes first.
df_edges_sorted = df_edges.sort_values('Edges', ascending=False)

df_edges_sorted.to_csv('./process document/gene_edges_count_sorted.csv', index=False)


In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# Raw (untransformed) matrix indexed by IDENTIFIER. Missing values are
# zero-filled, the same treatment the log-transform step applies to this file,
# so that no NaN reaches cosine_similarity.
expr_df = pd.read_csv('input.csv', sep=',', index_col='IDENTIFIER').fillna(0)

# Pairs that survived the SU filter.
pairs_df = pd.read_csv('./process document/filtered_gene_pairs_analysis.csv')

cosine_similarities = []

# Iterate over the two identifier columns directly: iterrows() would upcast the
# identifiers to the row's common dtype (e.g. int -> float) in a mixed-type frame.
for gene_a, gene_b in zip(pairs_df['PARTICIPANT_A'], pairs_df['PARTICIPANT_B']):
    # cosine_similarity expects 2-D input, hence the (1, n_samples) reshape.
    expr_a = expr_df.loc[gene_a].values.reshape(1, -1)
    expr_b = expr_df.loc[gene_b].values.reshape(1, -1)

    cosine_sim = cosine_similarity(expr_a, expr_b)[0][0]
    cosine_similarities.append(cosine_sim)

# NOTE: despite its name, the 'cov' column holds the cosine similarity of the
# two rows; the name is kept because later cells read it.
pairs_df['cov'] = cosine_similarities

pairs_df.to_csv('./process document/updated_gene_pairs.csv', index=False)

        

In [7]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  


# Tunable parameters for group growing.
n = 0.1  # minimum 'cov' value a pair needs to count as connected
x = 100  # upper bound on growth iterations per seed gene
Q = 5    # at most this many top-'cov' pairs are considered per iteration
W = 6    # growth stops once the group holds more than this many genes

# Inputs produced by the earlier steps.
gene_edges_df = pd.read_csv('./process document/gene_edges_count_sorted.csv')
gene_pairs_df = pd.read_csv('./process document/updated_gene_pairs.csv')

# Placeholder result frame with the expected columns.
output_df = pd.DataFrame(columns=['Iteration', 'Gene_Group_B'])

# Every gene listed in the edge-count table becomes a seed.
all_genes_list = gene_edges_df['Gene'].to_list()

def process_gene(gene_a, pairs=None, min_cov=None, max_iter=None, top_q=None, max_size=None):
    """Grow a gene group outward from a seed gene along high-'cov' pairs.

    Parameters
    ----------
    gene_a : hashable
        Seed gene.
    pairs : pandas.DataFrame, optional
        Frame with ``PARTICIPANT_A``, ``PARTICIPANT_B`` and ``cov`` columns;
        defaults to the module-level ``gene_pairs_df``.
    min_cov : float, optional
        A pair qualifies only when its ``cov`` is strictly greater than this;
        defaults to the module-level ``n``.
    max_iter : int, optional
        Maximum number of growth iterations; defaults to the module-level ``x``.
    top_q : int, optional
        When more than this many pairs touch the group, only the ``top_q`` with
        the highest ``cov`` are considered; defaults to the module-level ``Q``.
    max_size : int, optional
        Growth stops as soon as the group holds more than this many genes;
        defaults to the module-level ``W``.

    Returns
    -------
    pandas.DataFrame
        Empty (columns only) when the seed has no qualifying pair; otherwise a
        single row with ``Iteration`` fixed at 0 and ``Gene_Group_B`` holding
        the list of genes in the grown group.
    """
    pairs = gene_pairs_df if pairs is None else pairs
    min_cov = n if min_cov is None else min_cov
    max_iter = x if max_iter is None else max_iter
    top_q = Q if top_q is None else top_q
    max_size = W if max_size is None else max_size

    col_a = pairs['PARTICIPANT_A']
    col_b = pairs['PARTICIPANT_B']

    # The seed must take part in at least one qualifying pair.
    seed_pairs = pairs[(col_a == gene_a) | (col_b == gene_a)]
    if not (seed_pairs['cov'] > min_cov).any():
        return pd.DataFrame(columns=['Iteration', 'Gene_Group_B'])

    gene_group_b = {gene_a}

    for _ in range(max_iter):
        # All pairs with at least one endpoint already in the group.
        connected_pairs = pairs[col_a.isin(gene_group_b) | col_b.isin(gene_group_b)]

        # Keep only the top_q strongest pairs when there are more than that.
        if len(connected_pairs) > top_q:
            connected_pairs = connected_pairs.sort_values(by='cov', ascending=False).head(top_q)

        high_cov_pairs = connected_pairs[connected_pairs['cov'] > min_cov]
        if high_cov_pairs.empty:
            break

        size_before = len(gene_group_b)
        new_genes = set(high_cov_pairs['PARTICIPANT_A']).union(set(high_cov_pairs['PARTICIPANT_B']))
        gene_group_b.update(new_genes)

        # Stop once the group exceeds the size limit.
        if len(gene_group_b) > max_size:
            break

        # Converged: each iteration depends only on the current group, so an
        # iteration that adds nothing would repeat identically until max_iter.
        if len(gene_group_b) == size_before:
            break

    return pd.DataFrame({'Iteration': [0], 'Gene_Group_B': [list(gene_group_b)]})

# Process all genes with multithreading and progress bar
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_gene, all_genes_list), total=len(all_genes_list), desc="Processing Genes"))

# Combine all results into a single DataFrame. pd.concat raises on an empty
# list, so fall back to an empty frame with the expected columns.
if results:
    output_df = pd.concat(results, ignore_index=True)
else:
    output_df = pd.DataFrame(columns=['Iteration', 'Gene_Group_B'])

# Save output to CSV file
output_df.to_csv('./process document/gene_groups_output.csv', index=False)

# Genes that appear in at least one group. Computed once up front instead of
# re-exploding the whole column for every gene inside the comprehension.
selected_genes = set(output_df['Gene_Group_B'].explode().unique())

# Save unselected genes to a separate CSV file (written to the working
# directory, not to './process document').
remaining_genes_df = pd.DataFrame({
    'Remaining_Genes': [gene for gene in all_genes_list if gene not in selected_genes]
})
remaining_genes_df.to_csv('remaining_genes.csv', index=False)

Processing Genes: 100%|████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.34it/s]


In [9]:
import ast

import pandas as pd

# Minimum accumulated count a group needs in order to be kept.
p = 3

df = pd.read_csv('./process document/gene_groups_output.csv')

# Maps each distinct group (as a sorted tuple of genes) to an accumulated count:
# every occurrence of the group adds len(group), so the stored value is
# group size x number of occurrences.
group_counts = {}

for raw_group in df['Gene_Group_B']:
    # The column stores the text form of a Python list. literal_eval parses
    # literals only, whereas eval would execute arbitrary code found in the file.
    gene_group = ast.literal_eval(raw_group)

    group_key = tuple(sorted(gene_group))
    group_counts[group_key] = group_counts.get(group_key, 0) + len(gene_group)

filtered_group_counts = {group: count for group, count in group_counts.items() if count >= p}

# NOTE(review): the number printed and saved as 'Gene Count' is the accumulated
# value described above, not the number of genes in the group; confirm that
# this is the intended meaning.
for group, count in filtered_group_counts.items():
    print(f"Group {group} has {count} gene(s).")

filtered_group_counts_df = pd.DataFrame(list(filtered_group_counts.items()), columns=['Genes', 'Gene Count'])
filtered_group_counts_df.to_csv('./process document/filtered_gene_group_gene_counts.csv', index=False)
    

Group (3, 4, 20, 23, 27, 35, 41) has 7 gene(s).
Group (27, 33, 35, 37, 41, 49) has 18 gene(s).
Group (23, 27, 33, 35, 37, 41, 49) has 7 gene(s).
Group (0, 2, 27, 33, 35, 37, 41, 49) has 8 gene(s).
Group (8, 13, 27, 28, 35, 37, 41, 49) has 8 gene(s).
Group (24, 27, 33, 35, 37, 41, 49) has 7 gene(s).
Group (0, 2, 8, 13, 26, 48) has 24 gene(s).
Group (3, 4, 23, 27, 35, 37, 41, 49) has 8 gene(s).
Group (3, 4, 23, 27, 35, 37, 41) has 7 gene(s).
Group (0, 2, 8, 13, 28, 35, 37, 41, 49) has 9 gene(s).
Group (20, 27, 33, 35, 37, 41, 49) has 7 gene(s).
Group (27, 28, 33, 35, 37, 41, 49) has 7 gene(s).
Group (3, 4, 20, 33, 35, 37, 41, 49) has 8 gene(s).
Group (25, 31, 36, 42, 45) has 25 gene(s).
Group (29, 32, 39, 43, 46) has 25 gene(s).
Group (6, 11, 15, 18, 22) has 25 gene(s).
Group (1, 9, 12, 16, 19) has 25 gene(s).
Group (5, 10, 14, 17, 21) has 25 gene(s).
Group (30, 34, 40, 44, 47) has 25 gene(s).


In [10]:
import pandas as pd
import ast

def process_data(input_file, output_file, remove_subsets=False):
    """Expand the 'Genes' tuples of a group-count CSV into one column per gene.

    Parameters
    ----------
    input_file : str
        CSV with a ``Genes`` column holding the text form of a tuple, plus one
        other column (written out under the header ``Gene Count``).
    output_file : str
        Destination CSV path.
    remove_subsets : bool, default False
        When True, drop every row whose gene set is a proper subset of another
        row's gene set. Rows with identical gene sets are all kept.

    Side effects
    ------------
    Prints the number of removed rows (when ``remove_subsets`` is set), the head
    of the resulting frame, and a confirmation message; writes ``output_file``.
    """
    df = pd.read_csv(input_file)

    def split_genes(gene_string):
        """Parse one 'Genes' cell into a tuple of gene names as strings.

        Returns an empty tuple (after printing a warning) when the cell cannot
        be parsed as a Python literal.
        """
        try:
            gene_tuple = ast.literal_eval(gene_string)
            genes = tuple(str(gene).strip("()'\"") for gene in gene_tuple)
            return genes
        except (SyntaxError, ValueError) as e:
            print(f"Warning: Could not parse '{gene_string}'. Error: {e}")
            return tuple()

    df['Genes'] = df['Genes'].apply(split_genes)

    if remove_subsets:
        n = len(df)

        # Build each row's gene set once instead of re-reading it through
        # df.iloc inside the pairwise comparison.
        gene_sets = [set(genes) for genes in df['Genes']]

        # `a < b` on sets is the proper-subset test (subset and not equal);
        # a set is never a proper subset of itself, so no index check is needed.
        rows_to_keep = [
            i for i, genes_i in enumerate(gene_sets)
            if not any(genes_i < genes_j for genes_j in gene_sets)
        ]

        df = df.iloc[rows_to_keep].reset_index(drop=True)
        print(f"Removed {n - len(rows_to_keep)} subset data rows")

    # One column per tuple position; shorter tuples are padded with None.
    genes_df = pd.DataFrame(df['Genes'].tolist(), index=df.index)

    df = pd.concat([df.drop(columns=['Genes']), genes_df], axis=1)

    # Assumes exactly one column remains besides the expanded gene columns.
    df.columns = ['Gene Count'] + list(range(genes_df.shape[1]))

    print(df.head())

    df.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}")

# Entry point: expand the filtered group counts into 'output.csv'.
if __name__ == "__main__":
    # remove_subsets=True drops groups contained in a larger group;
    # pass False to keep every row.
    process_data(
        input_file='./process document/filtered_gene_group_gene_counts.csv',
        output_file='output.csv',
        remove_subsets=True,
    )

Removed 2 subset data rows
   Gene Count   0   1   2   3   4   5   6     7     8
0           7   3   4  20  23  27  35  41  None  None
1           7  23  27  33  35  37  41  49  None  None
2           8   0   2  27  33  35  37  41    49  None
3           8   8  13  27  28  35  37  41    49  None
4           7  24  27  33  35  37  41  49  None  None
Saved processed data to output.csv
