# miRNA data augmentation:
The following list of miRNAs will be used to augment the data frame in order to improve the models' accuracy.

miRNA list: hsa-mir-29a, hsa-mir-125b-1, hsa-mir-125b-2, hsa-mir-145, hsa-mir-149, hsa-mir-607-5p, hsa-mir-1246, hsa-mir-4488, hsa-mir-6777-5p, hsa-mir-492, hsa-mir-200a, hsa-mir-338, hsa-mir-29c, hsa-mir-101, hsa-mir-148a, hsa-mir-92a, hsa-mir-424, hsa-mir-210

Each candidate is either directly or indirectly linked to E-cadherin regulation and EMT, processes intimately associated with colorectal cancer metastasis. Their expression levels may serve as discriminative features between tumors that are localized and those that have begun to disseminate systemically.

In [None]:
!pip install numpy==2.2.0
!pip install pandas==2.2.3
!pip install scikit-learn==1.6.0
!pip install matplotlib==3.9.3
!pip install seaborn==0.13.2

Collecting numpy==2.2.0
  Downloading numpy-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m601.9 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.0 wh

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# Collecting additional miRNA CPM values from stage I-II cohort files:

In [None]:
# Step 1: Load the modules this cell relies on, then attach Google Drive to the Colab runtime
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Step 2: Root Drive directory that is searched (recursively) for the stage I-II files
base_folder = '/content/drive/MyDrive/.../.../'

# Step 3: miRNA IDs whose CPM values are pulled out of every file
# NOTE(review): these IDs differ from the miRNA list given in the notebook
# introduction - confirm which of the two lists is the intended one.
miRNA_list = [
    'hsa-mir-200a',
    'hsa-mir-200b',
    'hsa-mir-200c',
    'hsa-mir-141',
    'hsa-mir-210',
    'hsa-mir-135b',
    'hsa-mir-218-1',
    'hsa-mir-218-2',
    'hsa-mir-429',
]

# Step 4: Function to extract CPMs from one .txt file
def extract_miRNA_cpm(file_path, miRNAs):
    """
    Sum 'reads_per_million_miRNA_mapped' per requested miRNA in one tab-separated file.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated file that must contain the columns 'miRNA_ID' and
        'reads_per_million_miRNA_mapped'.
    miRNAs : iterable of str
        miRNA IDs to extract. The iterable is materialised once, so a one-shot
        iterator works as well as a list.

    Returns
    -------
    dict or None
        One entry per requested miRNA holding the sum of its rows as a float;
        miRNAs that do not occur in the file get 0.0. None is returned (and a
        message printed) when the file cannot be read or parsed, lacks a required
        column, or has a non-numeric value in one of the requested rows.
    """
    id_col = 'miRNA_ID'
    cpm_col = 'reads_per_million_miRNA_mapped'
    wanted = list(miRNAs)  # iterated twice below (filter + result dict)

    try:
        df = pd.read_csv(file_path, sep='\t')

        # Ensure required columns are present
        if id_col not in df.columns or cpm_col not in df.columns:
            raise ValueError("Missing required columns")

        selected = df.loc[df[id_col].isin(wanted), [id_col, cpm_col]]

        # A single non-numeric cell anywhere in the file makes pandas read the whole
        # column as text; summing text would concatenate it ("1" + "2" -> "12").
        # Converting the requested rows first guarantees an arithmetic sum, and a
        # non-numeric value in those rows raises ValueError, which skips the file.
        cpm = pd.to_numeric(selected[cpm_col])
        totals = cpm.groupby(selected[id_col]).sum()
        return {miRNA: float(totals.get(miRNA, 0.0)) for miRNA in wanted}

    # File-level problems only: I/O failures plus ValueError, which also covers
    # pandas' parser and empty-file errors. Anything else is a caller bug and is
    # allowed to propagate instead of being reported as a bad file.
    except (OSError, ValueError) as e:
        print(f"Skipping file {file_path}: {e}")
        return None  # Skip invalid files

# Step 5: Recursively find valid .txt files (anything with 'annotation' in its name is ignored)
# The per-directory file list is called `filenames`, not `files`, so this cell does not
# rebind the name `files` that the download cell imports from google.colab.
txt_file_paths = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(base_folder)
    for filename in filenames
    if filename.endswith('.txt') and 'annotation' not in filename.lower()
]

# Step 6: Process each file, keeping only those that could be parsed
data_rows = []
file_ids = []

for file_path in txt_file_paths:
    result = extract_miRNA_cpm(file_path, miRNA_list)
    if result is not None:
        data_rows.append(result)
        file_ids.append(os.path.basename(file_path))  # Change to file_path for full path as ID

# Step 7: Create final DataFrame
# `columns` is passed explicitly so the frame still has one column per miRNA
# (in miRNA_list order) when no file could be parsed.
df_new_features_I_II = pd.DataFrame(data_rows, index=file_ids, columns=miRNA_list)
df_new_features_I_II.index.name = 'File_ID'

# Basenames serve as row labels; files with the same name in different sub-folders
# would produce ambiguous labels, so make that visible.
if not df_new_features_I_II.index.is_unique:
    print("Warning: duplicate File_ID values found; consider using the full path as ID")

# Preview
df_new_features_I_II.head()

Mounted at /content/drive


Unnamed: 0_level_0,hsa-mir-200a,hsa-mir-200b,hsa-mir-200c,hsa-mir-141,hsa-mir-210,hsa-mir-135b,hsa-mir-218-1,hsa-mir-218-2,hsa-mir-429
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3cd62167-7962-44ea-8923-6e9c7fc97807.mirbase21.isoforms.quantification.txt,1066.081631,1034.097946,6698.111943,589.660582,290.508173,31.36624,9.014707,8.273772,174.243163
ef4cd175-6f73-4360-b2c6-71b424d64f53.mirbase21.isoforms.quantification.txt,2083.501186,2178.169163,13660.924822,2378.784704,1422.839555,143.143354,2.551335,1.611371,321.468283
98694eb1-1282-4426-8fb2-001ac8190323.mirbase21.isoforms.quantification.txt,3693.224731,2745.688985,12168.405539,3105.811604,719.753268,121.516788,1.475913,3.443796,573.145973
a6f1d4ee-b216-4b96-95a6-5705662254d7.mirbase21.isoforms.quantification.txt,3575.242417,4080.13581,17627.565659,3576.599655,69.490703,234.259676,7.871994,6.514754,585.513466
e3f4c57a-45e8-4dd6-96b1-e12ba2bdb415.mirbase21.isoforms.quantification.txt,2634.622522,1672.245559,12480.315015,2198.60103,396.189185,135.145322,9.958076,4.979038,233.303505


In [None]:
# Sanity check: display (rows, columns) of the extracted frame. There should be one row
# per successfully parsed file and one column per entry in miRNA_list.
df_new_features_I_II.shape

(437, 9)

In [None]:
# Now we have to add the target variable column: one 'Stage I-II' label per row.
# The number of labels is taken from the feature frame itself rather than hard-coded,
# so the labels still line up if the number of parsed files changes.
stages = ['Stage I-II'] * len(df_new_features_I_II)
# Then we transform the stages list to a data frame
df_list = pd.DataFrame(stages, columns=['Stages'])
df_list.head()

In [None]:
# Give the label frame the same index as the feature frame so the two line up
# row by row, then place them side by side.
df_list.index = df_new_features_I_II.index
df_stages_I_II_new = pd.concat(
    [df_new_features_I_II, df_list],
    axis='columns',
)
df_stages_I_II_new.loc[:, 'Stages'].head()

In [None]:
# Write the stage I-II frame to a CSV file, then hand that file to Colab's download helper
from google.colab import files

df_stages_I_II_new.to_csv('df_stages_I_II_new.csv')
files.download('df_stages_I_II_new.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Collecting additional miRNA CPM values from stage III-IV cohort files:

In [None]:
# Step 1: Load the modules this cell relies on, then attach Google Drive to the Colab runtime
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Step 2: Root Drive directory that is searched (recursively) for the stage III-IV files
base_folder = '/content/drive/MyDrive/.../.../'

# Step 3: miRNA IDs whose CPM values are pulled out of every file
# NOTE(review): these IDs differ from the miRNA list given in the notebook
# introduction - confirm which of the two lists is the intended one.
miRNA_list = [
    'hsa-mir-200a',
    'hsa-mir-200b',
    'hsa-mir-200c',
    'hsa-mir-141',
    'hsa-mir-210',
    'hsa-mir-135b',
    'hsa-mir-218-1',
    'hsa-mir-218-2',
    'hsa-mir-429',
]

# Step 4: Function to extract CPMs from one .txt file
def extract_miRNA_cpm(file_path, miRNAs):
    """
    Sum 'reads_per_million_miRNA_mapped' per requested miRNA in one tab-separated file.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated file that must contain the columns 'miRNA_ID' and
        'reads_per_million_miRNA_mapped'.
    miRNAs : iterable of str
        miRNA IDs to extract. The iterable is materialised once, so a one-shot
        iterator works as well as a list.

    Returns
    -------
    dict or None
        One entry per requested miRNA holding the sum of its rows as a float;
        miRNAs that do not occur in the file get 0.0. None is returned (and a
        message printed) when the file cannot be read or parsed, lacks a required
        column, or has a non-numeric value in one of the requested rows.
    """
    id_col = 'miRNA_ID'
    cpm_col = 'reads_per_million_miRNA_mapped'
    wanted = list(miRNAs)  # iterated twice below (filter + result dict)

    try:
        df = pd.read_csv(file_path, sep='\t')

        # Ensure required columns are present
        if id_col not in df.columns or cpm_col not in df.columns:
            raise ValueError("Missing required columns")

        selected = df.loc[df[id_col].isin(wanted), [id_col, cpm_col]]

        # A single non-numeric cell anywhere in the file makes pandas read the whole
        # column as text; summing text would concatenate it ("1" + "2" -> "12").
        # Converting the requested rows first guarantees an arithmetic sum, and a
        # non-numeric value in those rows raises ValueError, which skips the file.
        cpm = pd.to_numeric(selected[cpm_col])
        totals = cpm.groupby(selected[id_col]).sum()
        return {miRNA: float(totals.get(miRNA, 0.0)) for miRNA in wanted}

    # File-level problems only: I/O failures plus ValueError, which also covers
    # pandas' parser and empty-file errors. Anything else is a caller bug and is
    # allowed to propagate instead of being reported as a bad file.
    except (OSError, ValueError) as e:
        print(f"Skipping file {file_path}: {e}")
        return None  # Skip invalid files

# Step 5: Recursively find valid .txt files (anything with 'annotation' in its name is ignored)
# The per-directory file list is called `filenames`, not `files`, so this cell does not
# rebind the name `files` that the download cell imports from google.colab.
txt_file_paths = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(base_folder)
    for filename in filenames
    if filename.endswith('.txt') and 'annotation' not in filename.lower()
]

# Step 6: Process each file, keeping only those that could be parsed
data_rows = []
file_ids = []

for file_path in txt_file_paths:
    result = extract_miRNA_cpm(file_path, miRNA_list)
    if result is not None:
        data_rows.append(result)
        file_ids.append(os.path.basename(file_path))  # Change to file_path for full path as ID

# Step 7: Create final DataFrame
# `columns` is passed explicitly so the frame still has one column per miRNA
# (in miRNA_list order) when no file could be parsed.
df_new_features_III_IV = pd.DataFrame(data_rows, index=file_ids, columns=miRNA_list)
df_new_features_III_IV.index.name = 'File_ID'

# Basenames serve as row labels; files with the same name in different sub-folders
# would produce ambiguous labels, so make that visible.
if not df_new_features_III_IV.index.is_unique:
    print("Warning: duplicate File_ID values found; consider using the full path as ID")

# Preview
df_new_features_III_IV.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0_level_0,hsa-mir-200a,hsa-mir-200b,hsa-mir-200c,hsa-mir-141,hsa-mir-210,hsa-mir-135b,hsa-mir-218-1,hsa-mir-218-2,hsa-mir-429
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4af67b51-6646-4438-8c42-28c608750b8b.mirnaseq.isoforms.quantification.txt,14622.485405,23084.208039,48663.692453,32195.310446,3951.987235,2441.004808,0.110533,0.0,4349.241866
f46f56f5-5eda-4882-97a6-83f8a7b64ed7.mirbase21.isoforms.quantification.txt,3068.520336,3186.572129,21626.734858,1949.035156,833.091526,18.298029,14.40232,14.048164,507.268571
bfa61a22-dc64-4748-adbd-e2d99937147d.mirbase21.isoforms.quantification.txt,1682.675404,2026.59003,7943.453192,815.230821,1264.037088,24.134358,12.763362,8.586263,185.184796
caaf539c-4377-4f96-83fe-975a514fa2fd.mirbase21.isoforms.quantification.txt,2316.422328,2671.017427,7974.006983,1544.679975,751.422861,121.916964,10.757378,9.163693,515.955779
28a9df9a-7dcc-45d6-8652-c533af9b04f2.mirbase21.isoforms.quantification.txt,3302.15042,1952.933835,6328.969186,1556.858729,4094.300633,396.989828,10.061954,12.806124,308.261686


In [None]:
# Sanity check: display (rows, columns) of the extracted frame. There should be one row
# per successfully parsed file and one column per entry in miRNA_list.
df_new_features_III_IV.shape

(410, 9)

In [None]:
# Now we have to add the target variable column: one 'Stage III-IV' label per row.
# The number of labels is taken from the feature frame itself rather than hard-coded,
# so the labels still line up if the number of parsed files changes.
list_stages_III_IV = ['Stage III-IV'] * len(df_new_features_III_IV)
# Then we transform the stages list to a data frame
df_list_stages_III_IV = pd.DataFrame(list_stages_III_IV, columns=['Stages'])

In [None]:
# Give the label frame the same index as the feature frame so the two line up
# row by row, then place them side by side.
df_list_stages_III_IV.index = df_new_features_III_IV.index
df_stages_III_IV_new = pd.concat(
    [df_new_features_III_IV, df_list_stages_III_IV],
    axis='columns',
)
df_stages_III_IV_new.loc[:, 'Stages'].head()

Unnamed: 0_level_0,hsa-mir-200a,hsa-mir-200b,hsa-mir-200c,hsa-mir-141,hsa-mir-210,hsa-mir-135b,hsa-mir-218-1,hsa-mir-218-2,hsa-mir-429,Stages
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4af67b51-6646-4438-8c42-28c608750b8b.mirnaseq.isoforms.quantification.txt,14622.485405,23084.208039,48663.692453,32195.310446,3951.987235,2441.004808,0.110533,0.0,4349.241866,Stage III-IV
f46f56f5-5eda-4882-97a6-83f8a7b64ed7.mirbase21.isoforms.quantification.txt,3068.520336,3186.572129,21626.734858,1949.035156,833.091526,18.298029,14.40232,14.048164,507.268571,Stage III-IV
bfa61a22-dc64-4748-adbd-e2d99937147d.mirbase21.isoforms.quantification.txt,1682.675404,2026.59003,7943.453192,815.230821,1264.037088,24.134358,12.763362,8.586263,185.184796,Stage III-IV
caaf539c-4377-4f96-83fe-975a514fa2fd.mirbase21.isoforms.quantification.txt,2316.422328,2671.017427,7974.006983,1544.679975,751.422861,121.916964,10.757378,9.163693,515.955779,Stage III-IV
28a9df9a-7dcc-45d6-8652-c533af9b04f2.mirbase21.isoforms.quantification.txt,3302.15042,1952.933835,6328.969186,1556.858729,4094.300633,396.989828,10.061954,12.806124,308.261686,Stage III-IV


In [None]:
# Write the stage III-IV frame to a CSV file, then hand that file to Colab's download helper
df_stages_III_IV_new.to_csv('df_stages_III_IV_new.csv')

from google.colab import files
files.download('df_stages_III_IV_new.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>