In [1]:
import pandas as pd
import xlsxwriter as writer

In [2]:
def load_selected_features(filepath):
    return pd.read_csv(filepath)

def rank_features(svm_features, xgboost_features):
    # Rank features separately for SVM and XGBoost
    svm_features['SVM_Rank'] = svm_features['Importance'].rank(ascending=False, method='min')
    xgboost_features['XGBoost_Rank'] = xgboost_features['Importance'].rank(ascending=False, method='min')
    
    # Combine the features and calculate the average rank
    combined_features = pd.merge(svm_features, xgboost_features, on='Feature', how='outer', suffixes=('_SVM', '_XGBoost'))
    combined_features['Average_Rank'] = combined_features[['SVM_Rank', 'XGBoost_Rank']].mean(axis=1)
    
    # Combine classifier names
    def combine_classifiers(row):
        classifiers = []
        if not pd.isna(row['Classifier_SVM']):
            classifiers.append('SVM')
        if not pd.isna(row['Classifier_XGBoost']):
            classifiers.append('XGBoost')
        return ', '.join(classifiers)
    
    combined_features['Classifier'] = combined_features.apply(combine_classifiers, axis=1)
    return combined_features[['Feature', 'Average_Rank', 'Classifier']]

In [3]:

# Load the selected features for each comparison
ctl_b_features = load_selected_features('../GSEA/miRNA/50_ctl_b.csv')
ctl_c_features = load_selected_features('../GSEA/miRNA/50_ctl_c.csv')
b_c_features = load_selected_features('../GSEA/miRNA/50_b_c.csv')

# Separate the features by classifier
ctl_b_svm_features = ctl_b_features[ctl_b_features['Classifier'] == 'SVM']
ctl_b_xgboost_features = ctl_b_features[ctl_b_features['Classifier'] == 'XGBoost']
ctl_c_svm_features = ctl_c_features[ctl_c_features['Classifier'] == 'SVM']
ctl_c_xgboost_features = ctl_c_features[ctl_c_features['Classifier'] == 'XGBoost']
b_c_svm_features = b_c_features[b_c_features['Classifier'] == 'SVM']
b_c_xgboost_features = b_c_features[b_c_features['Classifier'] == 'XGBoost']

# Rank and get features for each comparison
ranked_ctl_b_features = rank_features(ctl_b_svm_features, ctl_b_xgboost_features)
ranked_ctl_c_features = rank_features(ctl_c_svm_features, ctl_c_xgboost_features)
ranked_b_c_features = rank_features(b_c_svm_features, b_c_xgboost_features)

# Create the final table with all features
final_table = pd.concat([
    ranked_ctl_b_features.rename(columns={'Feature': 'CTL_B', 'Classifier': 'Classifier_B'}),
    ranked_ctl_c_features.rename(columns={'Feature': 'CTL_C', 'Classifier': 'Classifier_C'}),
    ranked_b_c_features.rename(columns={'Feature': 'B_C', 'Classifier': 'Classifier_BC'})
], axis=1)

# Save the final table to a CSV file
final_table.to_excel('./output/all_miRNA_comparisons.xlsx', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_features['SVM_Rank'] = svm_features['Importance'].rank(ascending=False, method='min')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xgboost_features['XGBoost_Rank'] = xgboost_features['Importance'].rank(ascending=False, method='min')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_features[

In [4]:
miRNAs = final_table['CTL_B'].tolist() + final_table['CTL_C'].tolist() + final_table['B_C'].tolist()
miRNA_counts = pd.Series(miRNAs).value_counts()

# Function to determine the color based on the columns each miRNA appears in
def determine_color(miRNA):
    in_ctl_b = miRNA in final_table['CTL_B'].values
    in_ctl_c = miRNA in final_table['CTL_C'].values
    in_b_c = miRNA in final_table['B_C'].values

    if in_ctl_b and in_ctl_c and in_b_c:
        return 'Red'
    elif in_ctl_b and in_ctl_c:
        return 'Blue'
    elif in_ctl_b and in_b_c:
        return 'Green'
    elif in_ctl_c and in_b_c:
        return 'Magenta'
    else:
        return 'No Color'

# Create a dictionary to store the miRNA and their associated colors
miRNA_color_dict = {miRNA: determine_color(miRNA) for miRNA in miRNA_counts.index}

# Print each unique miRNA and its associated color
for miRNA, color in miRNA_color_dict.items():
    print(f"miRNA: {miRNA}, Color: {color}")

# Save the miRNA color dictionary to a new Excel file for reference
miRNA_color_df = pd.DataFrame(list(miRNA_color_dict.items()), columns=['miRNA', 'Color'])
miRNA_color_df.to_excel('./output/miRNA_color_mapping.xlsx', index=False)

miRNA: hsa-miR-1203, Color: Blue
miRNA: hsa-miR-4730, Color: Blue
miRNA: hsa-miR-4734, Color: Blue
miRNA: hsa-miR-4783-3p, Color: Blue
miRNA: hsa-miR-5572, Color: Green
miRNA: hsa-miR-575, Color: Blue
miRNA: hsa-miR-6131, Color: Blue
miRNA: hsa-miR-663a, Color: Blue
miRNA: hsa-miR-6746-5p, Color: Blue
miRNA: hsa-miR-6787-5p, Color: Blue
miRNA: hsa-miR-6802-5p, Color: Blue
miRNA: hsa-miR-6805-5p, Color: Blue
miRNA: hsa-miR-7110-5p, Color: Green
miRNA: hsa-miR-8069, Color: Blue
miRNA: hsa-miR-8073, Color: Blue
miRNA: hsa-miR-92a-2-5p, Color: Blue
miRNA: hsa-miR-92b-5p, Color: Blue
miRNA: hsa-miR-1228-5p, Color: Blue
miRNA: hsa-miR-5100, Color: Magenta
miRNA: hsa-miR-4732-5p, Color: Blue
miRNA: hsa-miR-6784-5p, Color: Blue
miRNA: hsa-miR-4706, Color: Blue
miRNA: hsa-miR-4419b, Color: Blue
miRNA: hsa-miR-4687-5p, Color: Blue
miRNA: hsa-miR-1307-3p, Color: Blue
miRNA: hsa-miR-1343-3p, Color: Blue
miRNA: hsa-miR-1469, Color: Blue
miRNA: hsa-miR-1238-5p, Color: Blue
miRNA: hsa-miR-1233-5p, Co