In [2]:
import itertools
import random

import numpy as np

def generate_initial_clusters(gene_indices, n_clusters, split_ratio=0.0, random_seed=None):
    """Split genes into one randomly drawn "fixed" cluster plus round-robin clusters.

    Args:
        gene_indices: Iterable of hashable gene identifiers (list, range,
            NumPy array, ...). It is copied and never mutated.
        n_clusters: Number of clusters the non-fixed genes are dealt into,
            round-robin in input order. Must be >= 1 when there are genes.
        split_ratio: Fraction of the genes sampled into the fixed cluster.
            Only values strictly between 0 and 1 produce a fixed cluster; the
            count is ``int(len(genes) * split_ratio)`` (truncated).
        random_seed: Seed for the sampling. When given, a private generator is
            used so the global ``random`` state is left untouched; when
            ``None`` the global ``random`` module state is used.

    Returns:
        Tuple ``(remaining_clusters, fixed_cluster)``: a list of non-empty
        lists of genes, and the (possibly empty) list of fixed genes in the
        order they were sampled.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1 while genes are present.
    """
    # Materialize once: supports any iterable and avoids the ambiguous
    # truth value of NumPy arrays.
    genes = list(gene_indices)

    # Handle empty gene list
    if not genes:
        return [], []

    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}.")

    # A private generator yields the same draws as seeding the global one,
    # without clobbering the caller's random state.
    rng = random.Random(random_seed) if random_seed is not None else random

    fixed_cluster = []
    remaining_genes = genes

    # Create fixed cluster if split ratio is specified
    if 0 < split_ratio < 1:
        num_fixed_genes = int(len(genes) * split_ratio)
        if num_fixed_genes > 0:
            fixed_cluster = rng.sample(genes, num_fixed_genes)
            # Set lookup keeps the filter linear in the number of genes.
            fixed_lookup = set(fixed_cluster)
            remaining_genes = [gene for gene in genes if gene not in fixed_lookup]

    # Dealing genes round-robin is the same as taking every n-th gene,
    # starting at each cluster's offset.
    remaining_clusters = [remaining_genes[offset::n_clusters] for offset in range(n_clusters)]

    # Remove empty clusters (fewer remaining genes than clusters)
    remaining_clusters = [cluster for cluster in remaining_clusters if cluster]

    return remaining_clusters, fixed_cluster

def generate_edges_from_clusters(clusters, intra_cluster_prob, random_seed=None):
    """Randomly connect pairs of nodes that share a cluster.

    Every unordered pair of positions inside each cluster becomes an edge with
    probability ``intra_cluster_prob`` (one uniform draw per pair).

    Args:
        clusters: Iterable of clusters, each an iterable of mutually
            comparable, hashable node identifiers.
        intra_cluster_prob: Probability of keeping each within-cluster pair.
        random_seed: Seed for the draws. When given, a private generator is
            used so the global ``random`` state is left untouched; when
            ``None`` the global ``random`` module state is used.

    Returns:
        Sorted list of unique edges, each a 2-tuple with its endpoints in
        ascending order.
    """
    # A private generator yields the same draws as seeding the global one,
    # without clobbering the caller's random state.
    rng = random.Random(random_seed) if random_seed is not None else random

    edges = set()
    for cluster in clusters:
        # combinations() visits pairs in the same (i < j) order as a nested
        # index loop, so the draw sequence for a given seed is unchanged.
        for pair in itertools.combinations(cluster, 2):
            if rng.random() < intra_cluster_prob:
                edges.add(tuple(sorted(pair)))
    return sorted(edges)

# Parameter settings
n_genes = 50
n_genes_group1 = 25
n_genes_group2 = 25
n_relevant_features = 50
n_clusters_per_group = 3
split_ratio = 0.4
intra_cluster_prob_group1_regular = 0.9
intra_cluster_prob_group2_regular = 0.9
intra_cluster_prob_fixed_merged = 0.7

random_seed_group1 = 42
random_seed_group2 = 123

# Group 2 is built below as "every index after group 1", so n_genes_group2 is
# never read anywhere else. Fail fast if the three sizes disagree.
if n_genes_group1 + n_genes_group2 != n_genes:
    raise ValueError(
        f"n_genes_group1 ({n_genes_group1}) + n_genes_group2 ({n_genes_group2}) "
        f"must equal n_genes ({n_genes})."
    )

# Gene indices for both groups: group 1 is [0, n_genes_group1), group 2 the rest
gene_indices_group1 = list(range(n_genes_group1))
gene_indices_group2 = list(range(n_genes_group1, n_genes))

# Generate clusters for group 1
remaining_clusters_group1, fixed_cluster_group1 = generate_initial_clusters(
    gene_indices_group1,
    n_clusters_per_group,
    split_ratio,
    random_seed_group1,
)

# Generate clusters for group 2
remaining_clusters_group2, fixed_cluster_group2 = generate_initial_clusters(
    gene_indices_group2,
    n_clusters_per_group,
    split_ratio,
    random_seed_group2,
)

# Merge fixed clusters of both groups into one sorted cluster
fixed_cluster_merged = sorted(fixed_cluster_group1 + fixed_cluster_group2)

# Generate edges for merged fixed cluster
# NOTE(review): with the seeds above, min(...) equals random_seed_group1, so
# these edges and the group-1 edges are drawn from the same seed. Confirm that
# sharing the random stream is intended.
edges_fixed_merged = []
if fixed_cluster_merged:
    edges_fixed_merged = generate_edges_from_clusters(
        [fixed_cluster_merged],
        intra_cluster_prob_fixed_merged,
        random_seed=min(random_seed_group1, random_seed_group2),
    )

# Generate edges for remaining clusters in group 1
edges_group1_remaining = generate_edges_from_clusters(
    remaining_clusters_group1,
    intra_cluster_prob_group1_regular,
    random_seed_group1,
)

# Generate edges for remaining clusters in group 2
edges_group2_remaining = generate_edges_from_clusters(
    remaining_clusters_group2,
    intra_cluster_prob_group2_regular,
    random_seed_group2,
)

# Combine all edges (deduplicated, sorted)
edges_combined = sorted(set(edges_group1_remaining + edges_group2_remaining + edges_fixed_merged))

# Save combined edges to file: one tab-separated node pair per line
edges_output_filename = "edges_combined.txt"
with open(edges_output_filename, 'w') as f:
    f.writelines(f"{edge[0]}\t{edge[1]}\n" for edge in edges_combined)
print(f"\nEdges saved to: {edges_output_filename}")

# Save merged fixed cluster to file: a single tab-separated line
if not fixed_cluster_merged:
    print("Error: Fixed cluster is empty")
else:
    fixed_cluster_output_filename = "fixed_cluster_merged.txt"
    with open(fixed_cluster_output_filename, 'w') as f:
        f.write('\t'.join(map(str, fixed_cluster_merged)) + '\n')
    print(f"Fixed cluster saved to: {fixed_cluster_output_filename}")

# Save group1 remaining clusters to file: one tab-separated cluster per line
clusters_group1_remaining_output_filename = "clusters_group1_remaining.txt"
with open(clusters_group1_remaining_output_filename, 'w') as f:
    f.writelines('\t'.join(map(str, cluster)) + '\n' for cluster in remaining_clusters_group1)
print(f"Group1 remaining clusters saved to: {clusters_group1_remaining_output_filename}")

# Save group2 remaining clusters to file: one tab-separated cluster per line
clusters_group2_remaining_output_filename = "clusters_group2_remaining.txt"
with open(clusters_group2_remaining_output_filename, 'w') as f:
    f.writelines('\t'.join(map(str, cluster)) + '\n' for cluster in remaining_clusters_group2)
print(f"Group2 remaining clusters saved to: {clusters_group2_remaining_output_filename}")

# Print summary information: per-group cluster counts and fixed clusters
print("\nNumber of remaining clusters in group1:", len(remaining_clusters_group1))
if not fixed_cluster_group1:
    print("Error: Fixed cluster not found in group1")
else:
    print("Fixed cluster in group1:", fixed_cluster_group1)

print("\nNumber of remaining clusters in group2:", len(remaining_clusters_group2))
if not fixed_cluster_group2:
    print("Error: Fixed cluster not found in group2")
else:
    print("Fixed cluster in group2:", fixed_cluster_group2)

print("\nMerged fixed cluster:", fixed_cluster_merged)

# Edge counts, reported in the same order the edge lists were generated
for summary_label, summary_edges in (
    ("Number of edges in merged fixed cluster:", edges_fixed_merged),
    ("Number of edges in group1 remaining clusters:", edges_group1_remaining),
    ("Number of edges in group2 remaining clusters:", edges_group2_remaining),
    ("Total number of edges after merging:", edges_combined),
):
    print(summary_label, len(summary_edges))

# Generate feature vectors (column vectors of length n_genes)
np.random.seed(random_seed_group1)

# u1: the first n_relevant_features entries are Uniform(-1, 1), the rest stay zero
u1 = np.zeros((n_genes, 1))
u1[:n_relevant_features] = np.random.uniform(-1, 1, size=(n_relevant_features, 1))

# u2: the last n_relevant_features entries are Uniform(-1, 1), the rest stay zero.
# Drawn after u1 from the same seeded stream, so the order of these two blocks matters.
u2 = np.zeros((n_genes, 1))
u2[n_genes - n_relevant_features:] = np.random.uniform(-1, 1, size=(n_relevant_features, 1))

print("\nShape of u1:", u1.shape)
print("Shape of u2:", u2.shape)




Edges saved to: edges_combined.txt
Fixed cluster saved to: fixed_cluster_merged.txt
Group1 remaining clusters saved to: clusters_group1_remaining.txt
Group2 remaining clusters saved to: clusters_group2_remaining.txt

Number of remaining clusters in group1: 3
Fixed cluster in group1: [20, 3, 0, 8, 7, 24, 4, 23, 2, 13]

Number of remaining clusters in group2: 3
Fixed cluster in group2: [26, 33, 27, 38, 48, 28, 49, 37, 35, 41]

Merged fixed cluster: [0, 2, 3, 4, 7, 8, 13, 20, 23, 24, 26, 27, 28, 33, 35, 37, 38, 41, 48, 49]
Number of edges in merged fixed cluster: 139
Number of edges in group1 remaining clusters: 29
Number of edges in group2 remaining clusters: 27
Total number of edges after merging: 195

Shape of u1: (50, 1)
Shape of u2: (50, 1)


In [3]:
import numpy as np
import pandas as pd

# Parameter settings
n_genes = 50
n_samples = 100  # number of columns of X (length of v1 and v2)
random_seed_v = 11
random_seed_epsilon = 42

# Read Excel file: the first two columns are used as the loading vectors u1 and u2
excel_file = "pc_loadings.xlsx"
try:
    df_excel = pd.read_excel(excel_file)
except FileNotFoundError:
    print(f"Error: File not found {excel_file}.")
    # Re-raise instead of exit(): exit() in a notebook shuts the kernel down
    # rather than marking the cell as failed.
    raise
except Exception as e:
    print(f"Error occurred while reading Excel file: {e}")
    raise

# Validate the sheet outside the try block so these errors surface as-is
if df_excel.shape[1] < 2:
    raise ValueError("Excel file needs at least two columns.")

# Coerce to float so non-numeric cells fail here rather than in the matrix product
u1 = df_excel.iloc[:, 0].to_numpy(dtype=float).reshape(-1, 1)
u2 = df_excel.iloc[:, 1].to_numpy(dtype=float).reshape(-1, 1)
if u1.shape[0] != n_genes or u2.shape[0] != n_genes:
    raise ValueError(f"Excel file rows ({u1.shape[0]}) != n_genes ({n_genes}).")
print("Successfully read u1 and u2.")

# Define parameters in the formula X = d1 * u1 v1^T + d2 * u2 v2^T + c * epsilon
d1 = 10
d2 = 5
c = 5

# Generate v1 and v2 (standard normal column vectors, one entry per sample)
np.random.seed(random_seed_v)
v1 = np.random.randn(n_samples, 1)
v2 = np.random.randn(n_samples, 1)

# Generate epsilon (standard normal noise, one row per gene)
np.random.seed(random_seed_epsilon)
epsilon = np.random.randn(n_genes, n_samples)

# Calculate X: two rank-one components plus scaled noise
X = d1 * u1 @ v1.T + d2 * u2 @ v2.T + c * epsilon

# Save matrix to text file (tab-separated)
output_filename = "matrix_x.txt"
np.savetxt(output_filename, X, delimiter='\t', fmt='%g')

print(f"Matrix X saved to file: {output_filename}")
print("Shape of X:", X.shape)

Successfully read u1 and u2.
Matrix X saved to file: matrix_x.txt
Shape of X: (50, 100)
