In [14]:
import pandas as pd

In [15]:
def load_selected_features(filepath):
    return pd.read_csv(filepath)

# Function to rank features based on their importance and calculate average rank
def rank_features(svm_features, xgboost_features, top_n=15):
    # Rank features separately for SVM and XGBoost
    svm_features['SVM_Rank'] = svm_features['Importance'].rank(ascending=False, method='min')
    xgboost_features['XGBoost_Rank'] = xgboost_features['Importance'].rank(ascending=False, method='min')
    
    # Combine the features and calculate the average rank
    combined_features = pd.merge(svm_features, xgboost_features, on='Feature', how='outer', suffixes=('_SVM', '_XGBoost'))
    combined_features['Average_Rank'] = combined_features[['SVM_Rank', 'XGBoost_Rank']].mean(axis=1)
    
    # Select top features based on average rank
    top_features = combined_features.nsmallest(top_n, 'Average_Rank')
    
    # Combine classifier names
    def combine_classifiers(row):
        classifiers = []
        if not pd.isna(row['Classifier_SVM']):
            classifiers.append('SVM')
        if not pd.isna(row['Classifier_XGBoost']):
            classifiers.append('XGBoost')
        return ', '.join(classifiers)
    
    top_features['Classifier'] = top_features.apply(combine_classifiers, axis=1)
    return top_features[['Feature', 'Average_Rank', 'Classifier']]

In [16]:
# Load the selected features for each comparison
ctl_s1_features = load_selected_features('../GSEA/miRNA/50_ctl_s1.csv')
s1_s2_features = load_selected_features('../GSEA/miRNA/50_s1_s2.csv')
s2_s3_features = load_selected_features('../GSEA/miRNA/50_s2_s3.csv')
s3_s4_features = load_selected_features('../GSEA/miRNA/50_s3_s4.csv')

# Separate the features by classifier
ctl_s1_svm_features = ctl_s1_features[ctl_s1_features['Classifier'] == 'SVM']
ctl_s1_xgboost_features = ctl_s1_features[ctl_s1_features['Classifier'] == 'XGBoost']
s1_s2_svm_features = s1_s2_features[s1_s2_features['Classifier'] == 'SVM']
s1_s2_xgboost_features = s1_s2_features[s1_s2_features['Classifier'] == 'XGBoost']
s2_s3_svm_features = s2_s3_features[s2_s3_features['Classifier'] == 'SVM']
s2_s3_xgboost_features = s2_s3_features[s2_s3_features['Classifier'] == 'XGBoost']  
s3_s4_svm_features = s3_s4_features[s3_s4_features['Classifier'] == 'SVM']
s3_s4_xgboost_features = s3_s4_features[s3_s4_features['Classifier'] == 'XGBoost']

# Rank and get top 20 features for each comparison
top_ctl_s1_features = rank_features(ctl_s1_svm_features, ctl_s1_xgboost_features)
top_s1_s2_features = rank_features(s1_s2_svm_features, s1_s2_xgboost_features)
top_s2_s3_features = rank_features(s2_s3_svm_features, s2_s3_xgboost_features)
top_s3_s4_features = rank_features(s3_s4_svm_features, s3_s4_xgboost_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_features['SVM_Rank'] = svm_features['Importance'].rank(ascending=False, method='min')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xgboost_features['XGBoost_Rank'] = xgboost_features['Importance'].rank(ascending=False, method='min')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_features[

In [17]:
# Create the final table
final_table = pd.DataFrame()

# Add top features and classifiers for ctl_s1
final_table['ctl_s1'] = top_ctl_s1_features['Feature'].values
final_table['Classifier'] = top_ctl_s1_features['Classifier'].values

# Add top features and classifiers for s1_s2
final_table['s1_s2'] = top_s1_s2_features['Feature'].values
final_table['Classifier1'] = top_s1_s2_features['Classifier'].values
# rename Classifier1 to Classifier
final_table = final_table.rename(columns={'Classifier1': 'Classifier'})

# Add top features and classifiers for s2_s3
final_table['s2_s3'] = top_s2_s3_features['Feature'].values
final_table['Classifier2'] = top_s2_s3_features['Classifier'].values
# rename Classifier2 to Classifier
final_table = final_table.rename(columns={'Classifier2': 'Classifier'})

# Add top features and classifiers for s3_s4
final_table['s3_s4'] = top_s3_s4_features['Feature'].values
final_table['Classifier3'] = top_s3_s4_features['Classifier'].values
# rename Classifier2 to Classifier
final_table = final_table.rename(columns={'Classifier3': 'Classifier'})

# Save the final table to a CSV file
# final_table.to_excel('./output/table2.xlsx', index=False)

In [18]:
# Load the final table from Excel
final_table = pd.read_excel('./output/table2.xlsx', skiprows=1)
final_table.columns

Index(['Unnamed: 0', 'ctl_s1', 'Classifier', 's1_s2', 'Classifier.1', 's2_s3',
       'Classifier.2', 's3_s4', 'Classifier.3'],
      dtype='object')

In [21]:
# Flatten all miRNA values into a single list and count occurrences
miRNAs = final_table['ctl_s1'].tolist() + final_table['s1_s2'].tolist() + final_table['s2_s3'].tolist() + final_table['s3_s4'].tolist()
miRNA_counts = pd.Series(miRNAs).value_counts()

# Function to determine the color based on the columns each miRNA appears in
def determine_color(miRNA):
    in_ctl_s1 = miRNA in final_table['ctl_s1'].values
    in_s1_s2 = miRNA in final_table['s1_s2'].values
    in_s2_s3 = miRNA in final_table['s2_s3'].values
    in_s3_s4 = miRNA in final_table['s3_s4'].values

    count = sum([in_ctl_s1, in_s1_s2, in_s2_s3, in_s3_s4])

    if count == 4:
        return 'Red'
    elif count == 3:
        return 'Blue'
    elif count == 2:
        return 'Green'
    else:
        return 'No Color'

# Create a dictionary to store the miRNA and their associated colors
miRNA_color_dict = {miRNA: determine_color(miRNA) for miRNA in miRNA_counts.index}

# Print each unique miRNA and its associated color
for miRNA, color in miRNA_color_dict.items():
    print(f"miRNA: {miRNA}, Color: {color}")

miRNA: hsa-miR-548h-5p, Color: Blue
miRNA: hsa-miR-4749-5p, Color: Green
miRNA: hsa-miR-139-3p, Color: Green
miRNA: hsa-miR-6769b-5p, Color: Green
miRNA: hsa-miR-4524b-5p, Color: No Color
miRNA: hsa-miR-1229-3p, Color: No Color
miRNA: hsa-miR-6774-3p, Color: No Color
miRNA: hsa-miR-660-5p, Color: No Color
miRNA: hsa-miR-6847-3p, Color: No Color
miRNA: hsa-miR-3130-3p, Color: No Color
miRNA: hsa-miR-1227-5p, Color: No Color
miRNA: hsa-miR-134-3p, Color: No Color
miRNA: hsa-miR-4722-3p, Color: No Color
miRNA: hsa-miR-5587-5p, Color: No Color
miRNA: hsa-miR-6829-3p, Color: No Color
miRNA: hsa-miR-7846-3p, Color: No Color
miRNA: hsa-miR-6715b-5p, Color: No Color
miRNA: hsa-miR-4511, Color: No Color
miRNA: hsa-miR-29b-1-5p, Color: No Color
miRNA: hsa-miR-5582-5p, Color: No Color
miRNA: hsa-miR-1225-3p, Color: No Color
miRNA: hsa-miR-3975, Color: No Color
miRNA: hsa-miR-4474-3p, Color: No Color
miRNA: hsa-miR-512-5p, Color: No Color
miRNA: hsa-miR-4516, Color: No Color
miRNA: hsa-miR-1250-5p