In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Change the working directory to the location where your data files are stored
os.chdir('../../')  # Replace with the actual path

# Verify the current working directory
print("Current working directory:", os.getcwd())

# Load the counts data
file_path = 'Data/RAW/raw_count_matrix/raw_count_matrix2.txt'
counts_df = pd.read_csv(file_path, delimiter='\t')
counts_df = counts_df.set_index('Gene')

# Replace hyphens with dots in the column names of the counts DataFrame
counts_df.columns = counts_df.columns.str.replace("-", ".")

# Transpose the counts DataFrame to align conditions as row indices
counts_df_t = counts_df.transpose()

# Load the metadata
file_path = 'Data/RAW/metadata.txt'
metadata_df = pd.read_csv(file_path, delimiter='\t')
metadata_df.columns = ['Sample', 'Treatment', 'Time']
metadata_df = metadata_df.set_index('Sample')

# Merge the transposed counts DataFrame with the metadata DataFrame based on the condition identifiers
merged_df = counts_df_t.merge(metadata_df, left_index=True, right_index=True)

# Find the maximum count for each gene across all time points and samples per condition
max_counts_per_condition = merged_df.groupby('Treatment').max()

# Transpose back to the original format
result_df = max_counts_per_condition.transpose()





Current working directory: c:\Users\alexa\OneDrive\Dokumente\GitHub\DL-GRN


In [6]:
# Log10 transform the values (adding 1 to each count to handle zeros)
log10_transformed_df = np.log10(result_df + 1)

# Save the transformed data to a new CSV file if needed
log10_transformed_df.to_csv('Data/Processed/log10_transformed_max_counts_per_condition.csv')

# Create a new DataFrame to store gene names and count levels
count_levels = pd.DataFrame(index=log10_transformed_df.index)

# Determine the quartiles for each condition and label the genes
for condition in log10_transformed_df.columns:
    data = log10_transformed_df[condition]
    
    # Calculate quartiles
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    
    # Label the genes based on quartile ranges
    count_levels[condition] = np.where(data <= q1, 0, 
                                       np.where(data >= q3, 1, ''))

# Combine the count levels into a single column
#count_levels['count_level'] = count_levels.apply(lambda row: '0' if 'low' in row.values else ('0' if 'high' in row.values else ''), axis=1)

# Filter out genes that are in the middle quartile (empty count_level)
#final_count_levels = count_levels[count_levels['count_level'] != '']

# Keep only the gene names and the final count_level
#final_count_levels = final_count_levels[['count_level']]

# Save the result to a new CSV file

count_levels.to_csv('Data/Processed/gene_count_levels.csv')

#sanity check
for condition in count_levels.columns:
    print(count_levels[condition].value_counts())
    print('\n')

# How often are genes in different count levels across conditions?
def has_mix_of_0s_and_1s(row):
    unique_values = set(row.dropna().values)
    return 0 in unique_values and 1 in unique_values

# Apply the function to each row and count occurrences
mix_count = count_levels.apply(has_mix_of_0s_and_1s, axis=1).sum()

# Output the count
print(f"Number of rows with a mix of 0s and 1s: {mix_count}")

ABA
     16470
0     8681
1     8390
Name: count, dtype: int64


ABA + MeJA
     16600
0     8555
1     8386
Name: count, dtype: int64


MeJA
     16536
0     8616
1     8389
Name: count, dtype: int64


Mock
     16723
0     8431
1     8387
Name: count, dtype: int64


SA
     16738
0     8417
1     8386
Name: count, dtype: int64


SA + MeJA
     16652
0     8502
1     8387
Name: count, dtype: int64


Number of rows with a mix of 0s and 1s: 0
