In [153]:
import pandas as pd

In [154]:
# Read the four stage-comparison gene tables in one pass; the unpacking order
# matches the order of the paths below.
ctl_s1, s1_s2, s2_s3, s3_s4 = map(
    pd.read_csv,
    [
        './figure_s1_data/ctl_s1_genes.csv',
        './figure_s1_data/s1_s2_genes.csv',
        './figure_s1_data/s2_s3_genes.csv',
        './figure_s1_data/s3_s4_genes.csv',
    ],
)

In [155]:
# Show the column layout of the first table.
print('columns:', ctl_s1.columns)

columns: Index(['Gene Symbol', 'p-value', 'FDR', 'Odd ratio', 'Number of interactions',
       'microRNA 1', 'microRNA 2', 'microRNA 3', 'microRNA 4', 'microRNA 5',
       'microRNA 6', 'microRNA 7', 'microRNA 8', 'microRNA 9', 'microRNA 10',
       'microRNA 11'],
      dtype='object')


In [156]:
# Tag every table with the comparison it came from, so rows stay traceable
# once the tables are stacked together.
for frame, label in (
    (ctl_s1, 'CTRL.S1'),
    (s1_s2, 'S1.S2'),
    (s2_s3, 'S2.S3'),
    (s3_s4, 'S3.S4'),
):
    frame['Source'] = label

In [157]:
# Stack the four tagged tables under a fresh 0..n-1 index, then discard the
# statistics columns that are not carried into the final table.
merged_data = (
    pd.concat([ctl_s1, s1_s2, s2_s3, s3_s4], ignore_index=True)
    .drop(columns=['p-value', 'FDR', 'Odd ratio'])
)

In [158]:
# Collapse the per-slot 'microRNA N' columns into a single comma-separated
# 'Targeting miRNAs' column, then put the columns in their final order.
miRNA_columns = [col for col in merged_data.columns if 'microRNA' in col]

# Guard on a non-empty column list so the cell is safe to re-run: after the
# first run the 'microRNA N' columns no longer exist, and applying the join
# to a zero-column selection would overwrite the already-built
# 'Targeting miRNAs' column with empty/NaN values.
if miRNA_columns:
    merged_data = merged_data.assign(**{
        # Missing slots (NaN) are skipped; remaining names are joined in
        # column order with ', '.
        'Targeting miRNAs': merged_data[miRNA_columns].apply(
            lambda row: ', '.join(row.dropna().astype(str)), axis=1
        )
    }).drop(columns=miRNA_columns)

# Reorder the columns (raises KeyError if 'Targeting miRNAs' was never built)
desired_order = ['Gene Symbol', 'Source', 'Number of interactions', 'Targeting miRNAs']
merged_data = merged_data[desired_order]

In [159]:
# Confirm the column set and order after the reshaping step.
print(f'{merged_data.columns}')

Index(['Gene Symbol', 'Source', 'Number of interactions', 'Targeting miRNAs'], dtype='object')


In [160]:
# Preview the first five rows and report the overall (rows, columns) size.
print(merged_data.head(5))
print('shape of merged_data: {}'.format(merged_data.shape))

  Gene Symbol   Source  Number of interactions  \
0       NACC1  CTRL.S1                      11   
1        NFIC  CTRL.S1                      10   
2        CALR  CTRL.S1                       9   
3      CDKN1A  CTRL.S1                       9   
4      SETD1B  CTRL.S1                       9   

                                    Targeting miRNAs  
0  hsa-miR-6766-5p, hsa-miR-6756-5p, hsa-miR-6787...  
1  hsa-miR-4690-5p, hsa-miR-6766-5p, hsa-miR-6756...  
2  hsa-miR-320a, hsa-miR-1343-3p, hsa-miR-6766-5p...  
3  hsa-miR-6802-5p, hsa-miR-663a, hsa-miR-1233-5p...  
4  hsa-miR-3940-5p, hsa-miR-4758-5p, hsa-miR-1238...  
shape of merged_data: (2960, 4)


In [161]:
# Order rows from most to fewest interactions, then keep only the rows with
# at least 5 interactions. Sorting happens before filtering, as a single chain.
merged_data = (
    merged_data
    .sort_values(by='Number of interactions', ascending=False)
    .loc[lambda frame: frame['Number of interactions'] >= 5]
)
merged_data.shape

(143, 4)

In [162]:
# Write the filtered table to disk without the DataFrame index column.
merged_data.to_csv(path_or_buf='./final_figures/figure_s1_final.csv', index=False)

In [163]:
# Count how many rows each gene symbol occupies and list the genes that
# occur more than once in the filtered table.
gene_counts = merged_data['Gene Symbol'].value_counts()
print(gene_counts.loc[gene_counts.gt(1)])

Gene Symbol
MED28     3
LMNB2     2
PEX26     2
SETD1B    2
NFIC      2
LRRC58    2
KMT2D     2
FOXK1     2
YWHAZ     2
GIGYF1    2
Name: count, dtype: int64
