In [None]:
import pandas as pd

file_path = 'data/Supplemental_Data_Raw_Genecounts.csv'

# Read the raw count table, discard the 'Unnamed: 17' column, and use the
# first CSV column (relabelled 'mnra') as the row index.
df = (
    pd.read_csv(file_path)
    .drop(columns='Unnamed: 17')
    .rename(columns={"Unnamed: 0": "mnra"})
    .set_index('mnra')
)

# Parse every column as numeric. Values that cannot be parsed become NaN
# (errors='coerce'); downcast='integer' requests the smallest integer dtype
# for columns where that is possible.
df = df.apply(pd.to_numeric, errors='coerce', downcast='integer')

# Report the table size before pruning.
print(df.shape)

# Keep only the rows that have at least one non-zero value. A column
# labelled 'index', if one exists, is left out of this check.
df = df.loc[df.loc[:, df.columns != 'index'].ne(0).any(axis=1)]

In [None]:
# Every column label except 'Unnamed: 0'.
# NOTE(review): the first cell renames 'Unnamed: 0' and moves it into the
# index, so this exclusion looks like a no-op here -- confirm.
columns_to_check = df.columns.difference(['Unnamed: 0'])

# Drop the rows whose values are zero in all of the checked columns.
df = df.loc[df[columns_to_check].ne(0).any(axis=1)]


In [None]:
# Pairwise Pearson correlation between the columns of df
# (DataFrame.corr compares columns with each other).
correlation_matrix = df.corr(method="pearson")



In [None]:
# Show the column-by-column correlation matrix as this cell's output.
correlation_matrix

In [None]:
# Remove 'B2.3.4' from both axes of the correlation matrix so that it does
# not appear in the filtered heatmap.
filtered_corr_matrix = (
    correlation_matrix
    .drop(index='B2.3.4')
    .drop(columns='B2.3.4')
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Heatmap of the filtered correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(filtered_corr_matrix, cmap='viridis', annot=False, ax=ax)
ax.set_title('sample-sample Correlation Heatmap')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Heatmap of the full (unfiltered) correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, cmap='viridis', annot=False, ax=ax)
ax.set_title('sample-sample Correlation Heatmap')
plt.show()

In [None]:
# Gene-gene Pearson correlation: DataFrame.corr compares columns, so the
# table is transposed first to put the rows of df (the genes) on the columns.
# NOTE(review): the result has one row and one column per row of df, so it
# may be large -- confirm it fits in memory for the full table.
gene_correlation_matrix = df.transpose().corr(method="pearson")

In [None]:
# Show the gene-gene correlation matrix as this cell's output.
gene_correlation_matrix

In [None]:
# Minimum correlation for a gene pair to be kept (example value).
threshold = 0.8

# Keep correlations that are >= threshold and not exactly 1.0; every other
# cell becomes NaN. Excluding 1.0 removes the diagonal (a gene with itself).
# NOTE(review): it also removes any distinct pair whose correlation is
# exactly 1.0 -- confirm that this is intended.
filtered_corr_matrix = gene_correlation_matrix.where(
    gene_correlation_matrix.ge(threshold) & gene_correlation_matrix.ne(1.0)
)


In [None]:
import networkx as nx

# Co-expression graph: one node per gene that has at least one surviving
# correlation, one weighted edge per surviving (non-NaN) cell.
G = nx.Graph()

# Walk the matrix row by row and drop the NaN cells of each row up front, so
# the inner loop only visits real edges instead of doing a scalar .loc lookup
# for every cell. This also avoids np.isnan: numpy is never imported in this
# notebook, so the previous check raised NameError on a fresh kernel.
# Each pair is seen twice because a correlation matrix is symmetric; nx.Graph
# is undirected, so the second add_edge only rewrites the same weight.
for gene1, row in filtered_corr_matrix.iterrows():
    for gene2, correlation in row.dropna().items():
        G.add_edge(gene1, gene2, weight=correlation)


In [None]:
import matplotlib.pyplot as plt

# Draw the full co-expression network with a force-directed layout.
plt.figure(figsize=(8, 8))
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every Restart & Run All.
pos = nx.spring_layout(G, seed=42)  # positions for all nodes
nx.draw(G, pos, with_labels=True, node_color="skyblue", node_size=1000, font_size=10)
plt.title("Initial Co-expression Network")
plt.show()


In [None]:
# Select first 100 genes for testing
subset_corr_matrix = gene_correlation_matrix.iloc[:100, :100]
threshold = 0.85  # for example

# Keep correlations that are >= threshold and not exactly 1.0 (which removes
# the diagonal); every other cell becomes NaN.
filtered_subset_corr_matrix = subset_corr_matrix[(subset_corr_matrix >= threshold) & (subset_corr_matrix != 1.0)]

G_subset = nx.Graph()

# Add one weighted edge per surviving (non-NaN) cell. NaN cells are dropped
# per row before iterating, which replaces the per-cell .loc lookup and the
# np.isnan check -- numpy is never imported in this notebook, so that check
# raised NameError on a fresh kernel.
# Each pair is seen twice because the matrix is symmetric; nx.Graph is
# undirected, so the second add_edge only rewrites the same weight.
for gene1, row in filtered_subset_corr_matrix.iterrows():
    for gene2, correlation in row.dropna().items():
        G_subset.add_edge(gene1, gene2, weight=correlation)


In [None]:

# Draw the subset network. spring_layout starts from random positions, so a
# fixed seed is passed to make the figure identical on every run.
pos_subset = nx.spring_layout(G_subset, seed=42)
fig, ax = plt.subplots(figsize=(8, 8))
# Labels are switched off, so font_size has no visible effect here.
nx.draw(G_subset, pos_subset, ax=ax, with_labels=False, node_color="skyblue", node_size=1, font_size=1)
ax.set_title("Initial Co-expression Network (Subset)")
plt.show()

In [None]:
import community  # Python Louvain method library

# Louvain community detection on the subset graph. The algorithm is
# randomized, so random_state is fixed to get the same modules on every run.
partition = community.best_partition(G_subset, random_state=42)

# Build a two-level graph: each gene is linked to a synthetic "Module_<id>"
# node that stands for the community it was assigned to.
G_hierarchy = nx.Graph()

for node, mod_class in partition.items():
    G_hierarchy.add_edge(node, f"Module_{mod_class}")

# Lay out and draw the gene -> module graph. The layout is seeded so that the
# figure is reproducible, and the Axes created here is passed to nx.draw
# explicitly instead of relying on it being the current Axes.
pos_hierarchy = nx.spring_layout(G_hierarchy, seed=42)
fig, ax = plt.subplots(figsize=(8, 8))
nx.draw(G_hierarchy, pos_hierarchy, ax=ax, with_labels=True, node_color="skyblue", node_size=1000, font_size=10)
ax.set_title("Hierarchical Co-expression Network")
plt.show()
